牛客高级项目课（12）

科技2024-07-01 69

使用python爬虫爬取知乎和v2ex数据，充实网站信息。安装python3.x并且配置环境变量，同时安装pycharm和pip。安装好以后，先熟悉python的语法，写一些例子，比如数据类型、操作符、方法调用，以及面向对象的技术。因为数据是要导入数据库的，所以这里安装MySQLdb这个库，并且写一下连接数据库的代码，再写一些简单的crud进行测试。使用requests库作为发送http请求的工具，使用beautifulsoup作为解析html代码的工具，请求之后直接使用css选择器匹配，即可获得内容。当然现在我们有更方便的工具pyspider，可以方便地发起并解析请求，并且可以设置代理、伪装身份等，直接传入url并且写好多级的解析函数，程序便会迭代执行，直到把所有页面的内容解析出来。这里我们直接启动pyspider的web应用并且写好python代码，就可以执行爬虫了。把解析出的东西写入数据库，代码如下。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-08-17 11:11:46
# Project: v2ex
"""pyspider project: crawl v2ex topics and store them in the `question` table."""

from pyspider.libs.base_handler import *
import re
import random
import MySQLdb


class Handler(BaseHandler):
    """Walk v2ex (home -> tabs -> boards -> topics) and save each topic.

    Every topic page that is reached is inserted as one row of the
    `question` table in the local `wenda` MySQL database.
    """

    crawl_config = {
    }

    def __init__(self):
        # One connection per handler instance; utf8 so that Chinese text
        # is sent to MySQL without being mangled.
        self.db = MySQLdb.connect('localhost', 'root', 'nowcoder', 'wenda', charset='utf8')

    def add_question(self, title, content):
        """Insert one question row; log and roll back on any failure.

        Args:
            title: topic title text.
            content: topic body as an HTML string.
        """
        # Values are passed as query parameters instead of being formatted
        # into the SQL text: crawled HTML routinely contains quotes, which
        # would otherwise break the statement or allow SQL injection.
        sql = ('insert into question(title, content, user_id, created_date, comment_count) '
               'values (%s, %s, %s, now(), 0)')
        # NOTE(review): presumably users with ids 1..10 already exist in the
        # target database -- confirm before running against a fresh schema.
        user_id = random.randint(1, 10)
        try:
            cursor = self.db.cursor()
            cursor.execute(sql, (title, content, user_id))
            self.db.commit()
        except Exception as e:
            # Best effort: a failed insert must not abort the crawl task.
            print(e)
            self.db.rollback()

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: schedule the v2ex home page once a day."""
        self.crawl('http://v2ex.com', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every tab link found on the home page."""
        for each in response.doc('a[href^="http://v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page)

    @config(age=10 * 24 * 60 * 60)
    def tab_page(self, response):
        """Follow every board (node) link found on a tab page."""
        for each in response.doc('a[href^="http://v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page)

    @config(priority=2)
    def board_page(self, response):
        """Follow topic links and pagination links found on a board page."""
        for each in response.doc('a[href^="http://v2ex.com/t/"]').items():
            url = each.attr.href
            # Drop the "#replyN" fragment so the same topic is not crawled
            # once per distinct reply anchor.
            if '#reply' in url:
                url = url.split('#', 1)[0]
            self.crawl(url, callback=self.detail_page)

        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page)

    @config(priority=20)
    def detail_page(self, response):
        """Extract title and body of one topic, store it, and return it.

        Returns:
            dict with the page `url`, the topic `title` and its `content`.
        """
        title = response.doc('h1').text()
        # .html() yields None when the topic has no body element; store an
        # empty string rather than the literal text "None".
        content = response.doc('div.topic_content').html() or ''
        self.add_question(title, content)
        return {
            'url': response.url,
            'title': title,
            'content': content,
        }

知乎：先找到问题，再把问题下所有的回答进行爬取，最后把问题和评论一起处理。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-08-19 14:21:53
# Project: zhihu
"""pyspider project: crawl zhihu questions and their answers into MySQL."""

from pyspider.libs.base_handler import *
import random
import MySQLdb


class Handler(BaseHandler):
    """Crawl top-answer listings of two zhihu topics.

    Each question page becomes one row in `question`; every answer found
    on that page becomes one row in `comment` that points at the question.
    """

    crawl_config = {
        'itag': 'v1',
        'headers': {
            'User-Agent': 'GoogleBot',
            'Host': 'www.zhihu.com',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }
    }

    def __init__(self):
        # One connection per handler instance; utf8 so that Chinese text
        # is sent to MySQL without being mangled.
        self.db = MySQLdb.connect('localhost', 'root', 'nowcoder', 'wenda', charset='utf8')

    def add_question(self, title, content, comment_count):
        """Insert one question row.

        Args:
            title: question title text.
            content: question body as an HTML string.
            comment_count: number of answers found on the page.

        Returns:
            The id of the inserted row, or 0 if the insert failed.
        """
        # Parameterized query: crawled text is never spliced into the SQL
        # string, so quotes and backslashes need no manual escaping.
        sql = ('insert into question(title, content, user_id, created_date, comment_count) '
               'values (%s, %s, %s, now(), %s)')
        # NOTE(review): presumably users with ids 1..10 already exist in the
        # target database -- confirm before running against a fresh schema.
        user_id = random.randint(1, 10)
        try:
            cursor = self.db.cursor()
            cursor.execute(sql, (title, content, user_id, comment_count))
            qid = cursor.lastrowid
            self.db.commit()
            return qid
        except Exception as e:
            # Best effort: a failed insert must not abort the crawl task.
            print(e)
            self.db.rollback()
        return 0

    def add_comment(self, qid, comment):
        """Insert one answer as a comment attached to question `qid`.

        Args:
            qid: id of the question row the comment belongs to.
            comment: answer body as an HTML string.
        """
        # NOTE(review): entity_type 1 presumably means "question" in the
        # application's schema -- confirm against the web app's constants.
        sql = ('insert into comment(content, entity_type, entity_id, user_id, created_date) '
               'values (%s, %s, %s, %s, now())')
        user_id = random.randint(1, 10)
        try:
            cursor = self.db.cursor()
            cursor.execute(sql, (comment, 1, qid, user_id))
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: schedule the two topic listings once a day."""
        self.crawl('https://www.zhihu.com/topic/19554298/top-answers?page=3',
                   callback=self.index_page, validate_cert=False)
        self.crawl('https://www.zhihu.com/topic/19552330/top-answers?page=3',
                   callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow question links and pager links found on a listing page."""
        for each in response.doc('a.question_link').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
        for each in response.doc('.zm-invite-pager span a').items():
            self.crawl(each.attr.href, callback=self.index_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Store one question together with all answers shown on its page.

        Returns:
            dict with the page `url`, the question `title` and the raw
            HTML `content` of the question body.
        """
        title = response.doc('span.zm-editable-content').text()

        # The question body sits in one of two containers; fall back to the
        # hidden one, and to an empty string when neither is present.
        html = response.doc('#zh-question-detail .zm-editable-content').html()
        if html is None:
            html = response.doc('#zh-question-detail .content.hidden').html()
        content = html or ''

        # Read the answers once so the stored count and the inserted rows
        # always agree; an answer element without HTML becomes ''.
        answers = [each.html() or ''
                   for each in response.doc('div.zm-editable-content.clearfix').items()]

        qid = self.add_question(title, content, len(answers))
        # add_question returns 0 on failure; do not attach comments to a
        # question row that was never created.
        if qid:
            for answer in answers:
                self.add_comment(qid, answer)

        return {
            "url": response.url,
            "title": title,
            "content": content,
        }

使用solr搭建全文搜索引擎，开发知乎的全文搜索功能。solr是一个成熟的全文搜索引擎工具，底层是Lucene实现，主要是java语言写的。下载solr6.2，完成solr环境搭建，简单测试多副本部署和单机部署。solr默认英文分词，需要加入中文分词工具IK-Analyzer。solr中一个core代表一个全文搜索集，我们可以在server文件夹中找到我们创建的core。然后根据需要修改conf里的配置文件：首先修改managed-schema来设置分词规则，我们在此加入中文分词类型，并且配置其索引分词和查询分词，此处需要引入IK-Analyzer的jar包，jar包可以通过maven项目打包而获得。索引分词指的是建立索引时使用的分词，比如“你好北京”，可以分为“你”“你好”“北京”“北”等情况；而查询分词是根据需求进行查询时的分词，可以分为“你好”“北京”。为了通过数据库向solr导入数据，我们需要配置数据导入处理器，这时需要修改solrconfig文件来配置数据导入处理器，并且在solr-data-config中配置本地数据库地址，这样就可以在solr的web页面中进行数据库导入了。导入之后自动建立索引，我们就可以使用solr来对数据库进行全文搜索了。比如mysql数据库进行普通搜索，把数据导入solr进行全文搜索。开发搜索相关功能：开发service并且接入solr的api，从而连接本机的solr服务并且执行查询和索引操作。只需要指定关键字，以及我们要搜索的栏目（本例中主要是title和content，所以传入这两个字段），并且在搜索结果中加亮关键字。开发相关controller以及页面，并且在新增问题以后执行异步事件，将新增的问题数据加入solr的数据库中，以便后面进行搜索。

Processed: 0.022, SQL: 8