I'm 单薄康乃馨, a blogger at 靠谱客. This article walks through a Python web-scraping case study: collecting the movie data from the Douban Top 250. I'm sharing it here in the hope that it makes a useful reference.

Scraping the Douban Top 250 movie data into Excel

from bs4 import BeautifulSoup          # parse the HTML and extract data
import re                              # regular expressions for text matching
import urllib.request, urllib.error    # build the request and fetch the page
import xlwt                            # write the Excel file


def main():
    baseUrl = "https://movie.douban.com/top250?start="
    savePath = './豆瓣电影Top250.xls'
    # 1. Fetch the pages
    # 2. Parse the data
    dataList = getData(baseUrl)
    # 3. Save to Excel
    saveData(dataList, savePath)


## Regular-expression patterns used for matching below
findLink = re.compile(r'<a href="(.*?)">')  # movie detail-page link
# movie poster; re.S lets '.' also match newlines
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)
findTitle = re.compile(r'<span class="title">(.*?)</span>')
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# number of ratings
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# one-line summary
findInq = re.compile(r'<span class="inq">(.*)</span>', re.S)
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)


# Fetch and parse the pages
def getData(baseUrl):
    dataList = []
    for i in range(0, 10):
        url = baseUrl + str(i * 25)
        # request the page from the server
        html = askURL(url)
        # parse each page with the built-in parser
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):  # one entry per movie
            data = []  # all fields of one movie
            item = str(item)  # convert to string for regex matching
            link = re.findall(findLink, item)[0]
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]  # Chinese title
                data.append(ctitle)
                otitle = titles[1].replace("/", "")  # foreign title
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(' ')
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            judge = re.findall(findJudge, item)[0]
            data.append(judge)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")  # strip the trailing full stop
                data.append(inq)
            else:
                data.append("")
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>', '', bd)
            bd = re.sub('/', '', bd)
            data.append(bd.strip())
            dataList.append(data)
    return dataList


# Save the data to Excel
def saveData(dataList, savePath):
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):  # write the header row
        sheet.write(0, i, col[i])
    for i in range(0, len(dataList)):  # normally 250 rows
        print("Row {0}".format(i + 1))
        data = dataList[i]
        for j in range(0, 8):  # write one movie per row
            sheet.write(i + 1, j, data[j])
    book.save(savePath)


# Fetch the HTML of a single URL
def askURL(url):
    head = {  # request headers that make the request look like a browser
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        # read the page body and decode it as utf-8
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


if __name__ == "__main__":
    main()
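If you want to sanity-check the export, a minimal sketch along these lines reads the workbook back. This assumes the xlrd package is installed; xlrd still reads the legacy .xls format that xlwt produces.

import xlrd  # assumed available: pip install xlrd

book = xlrd.open_workbook('豆瓣电影Top250.xls')
sheet = book.sheet_by_index(0)
print(sheet.nrows)             # expect 251: one header row plus 250 movies
for r in range(0, 3):          # header plus the first two movies
    print(sheet.row_values(r))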

Scraping the Douban Top 250 data and saving it to SQLite

from bs4 import BeautifulSoup          # parse the HTML and extract data
import re                              # regular expressions for text matching
import urllib.request, urllib.error    # build the request and fetch the page
import xlwt                            # write the Excel file (alternative path)
import sqlite3                         # SQLite database access


def main():
    baseUrl = "https://movie.douban.com/top250?start="
    savePath = './豆瓣电影Top250.xls'
    dbPath = "movie.db"
    # 1. Fetch the pages
    # 2. Parse the data
    dataList = getData(baseUrl)
    # 3. Save to the database (or to Excel via saveData)
    # saveData(dataList, savePath)
    saveDBData(dataList, dbPath)


## Regular-expression patterns used for matching below
findLink = re.compile(r'<a href="(.*?)">')  # movie detail-page link
# movie poster; re.S lets '.' also match newlines
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)
findTitle = re.compile(r'<span class="title">(.*?)</span>')
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# number of ratings
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# one-line summary
findInq = re.compile(r'<span class="inq">(.*)</span>', re.S)
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)


# Fetch and parse the pages
def getData(baseUrl):
    dataList = []
    for i in range(0, 10):
        url = baseUrl + str(i * 25)
        # request the page from the server
        html = askURL(url)
        # parse each page with the built-in parser
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):  # one entry per movie
            data = []  # all fields of one movie
            item = str(item)  # convert to string for regex matching
            link = re.findall(findLink, item)[0]
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]  # Chinese title
                data.append(ctitle)
                otitle = titles[1].replace("/", "")  # foreign title
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(' ')
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            judge = re.findall(findJudge, item)[0]
            data.append(judge)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")  # strip the trailing full stop
                data.append(inq)
            else:
                data.append("")
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>', '', bd)
            bd = re.sub('/', '', bd)
            data.append(bd.strip())
            dataList.append(data)
    return dataList


# Save the data to Excel
def saveData(dataList, savePath):
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):  # write the header row
        sheet.write(0, i, col[i])
    for i in range(0, len(dataList)):  # normally 250 rows
        print("Row {0}".format(i + 1))
        data = dataList[i]
        for j in range(0, 8):  # write one movie per row
            sheet.write(i + 1, j, data[j])
    book.save(savePath)


# Save the data to SQLite
def saveDBData(dataList, dbPath):
    init_db(dbPath)  # make sure the table exists
    conn = sqlite3.connect(dbPath)
    cur = conn.cursor()
    sql = """
        insert into movie250 (
            info_link, pic_link, cname, ename, score, rated, introduction, info)
        values (?, ?, ?, ?, ?, ?, ?, ?)
    """
    for data in dataList:
        # placeholders let sqlite3 handle quoting, so quotes inside the
        # scraped text cannot break the statement
        cur.execute(sql, data)
    conn.commit()
    cur.close()
    conn.close()


def init_db(dbPath):
    sql = """
        create table if not exists movie250(
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rated numeric,
            introduction text,
            info text)
    """
    conn = sqlite3.connect(dbPath)
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()


# Fetch the HTML of a single URL
def askURL(url):
    head = {  # request headers that make the request look like a browser
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        # read the page body and decode it as utf-8
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


if __name__ == "__main__":
    main()
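To confirm the rows actually landed in SQLite, a quick check with the standard-library sqlite3 module works. This sketch assumes one full run of the script above has already populated movie.db:

import sqlite3

conn = sqlite3.connect("movie.db")
cur = conn.cursor()
cur.execute("select count(*) from movie250")
print(cur.fetchone()[0])       # expect 250 after a complete run
cur.execute("select cname, score from movie250 order by score desc limit 3")
for row in cur.fetchall():     # the three highest-rated entries
    print(row)
conn.close()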

Finally

The above is everything 单薄康乃馨 has recently collected and organized for this Python scraping case study on the Douban Top 250 movie data. I hope it serves as a useful reference.
