import re

import requests

from common import handleDB

class Crawler:
    """A crawler for the Maoyan top-100 movie board."""

    def __init__(self):
        self.db = handleDB.HandleMysql()
    @staticmethod
    def get_html(url, header):
        """Fetch a page and return its HTML text, or None on a non-200 status."""
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response.text
        return None
    @staticmethod
    def get_data(html, list_data):
        """Extract every movie on one board page into list_data."""
        pattern = re.compile(r'<dd>.*?<i.*?>(\d+)</i>.*?'
                             r'<p class="name"><a.*?data-val=".*?">(.*?)'
                             r'</a>.*?<p.*?class="releasetime">(.*?)</p>'
                             r'.*?<i.*?"integer">(.*?)</i>'
                             r'.*?<i.*?"fraction">(.*?)</i>.*?</dd>', re.S)
        m = pattern.findall(html)
        for i in m:
            ranking = i[0]
            movie = i[1]
            release_time = i[2]
            # The score is split across "integer" and "fraction" tags, e.g. '9.' + '6'.
            score = i[3] + i[4]
            list_data.append([ranking, movie, release_time, score])
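
    # For reference, a simplified fragment of the board markup that the
    # pattern above targets; the exact attributes on the live page may
    # differ, so treat this sample as an assumption for illustration:
    #
    #   <dd>
    #     <i class="board-index board-index-1">1</i>
    #     <p class="name"><a href="/films/1203" data-val="{movieId:1203}">霸王别姬</a></p>
    #     <p class="releasetime">上映时间:1993-01-01</p>
    #     <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
    #   </dd>
    #
    # For that fragment findall yields the tuple
    #   ('1', '霸王别姬', '上映时间:1993-01-01', '9.', '6')
    # and get_data appends ['1', '霸王别姬', '上映时间:1993-01-01', '9.6'].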
    def write_data(self, sql, data):
        """Insert one movie row; opens and closes a connection per call,
        which is simple, if not the most efficient, for 100 rows."""
        self.db.conn_mysql()
        try:
            self.db.execute_sql(sql, data)
            print('Insert succeeded')
        except Exception:
            print('Insert failed')
        self.db.close_mysql()
    def run_main(self):
        start_url = 'http://maoyan.com/board/4'
        depth = 10  # the board lists 100 movies, ten per page
        header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "maoyan.com",
            "Referer": "http://maoyan.com/board",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36",
        }
        for i in range(depth):
            # Page i of the board lives at ?offset=10*i (0, 10, ..., 90).
            url = start_url + '?offset=' + str(10 * i)
            html = self.get_html(url, header)
            if html is None:
                print('Fetch failed: ' + url)
                continue
            list_data = []
            self.get_data(html, list_data)
            # list_data is the processed regex output: one big list whose
            # elements are the per-movie lists built by get_data. Iterate
            # over it and write each movie's row to the database.
            for movie in list_data:
                sql = ('insert into maoyan_movie'
                       '(ranking, movie, release_time, score) '
                       'values(%s, %s, %s, %s)')
                self.write_data(sql, movie)

if __name__ == '__main__':
    test = Crawler()
    test.run_main()
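
The common.handleDB helper imported at the top is not shown in this post. Below is a minimal sketch of the interface the crawler relies on (conn_mysql, execute_sql, close_mysql), assuming a pymysql backend; the connection parameters, credentials, and maoyan_movie schema are illustrative placeholders, not the original module:

import pymysql


class HandleMysql:
    def conn_mysql(self):
        # Hypothetical connection parameters; adjust to your environment.
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='123456', db='maoyan',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def execute_sql(self, sql, data):
        # Parameterized execution; data is one [ranking, movie,
        # release_time, score] row produced by the crawler.
        self.cursor.execute(sql, data)
        self.conn.commit()

    def close_mysql(self):
        self.cursor.close()
        self.conn.close()

# The insert in run_main assumes a table roughly like this (an assumption;
# all columns stored as text for simplicity):
#   create table maoyan_movie (
#       ranking varchar(10),
#       movie varchar(100),
#       release_time varchar(50),
#       score varchar(10)
#   );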