
Sharing My Web Scraper Code

Welcome to my page! This post presents several web scrapers I have written. The code is original; feel free to use it as a reference.

1. Scraping a Zhihu user profile

import csv
import re
import time
from datetime import datetime

import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Browser settings
options = Options()
# Use Chinese as the browser language
options.add_argument('lang=zh_CN.UTF-8')
# Hide the "Chrome is being controlled by automated software" banner
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# Spoof the User-Agent header
options.add_argument(
    'user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
# Let get() return immediately instead of waiting for the page to finish loading
options.page_load_strategy = 'none'

# Point the Service object at the local chromedriver
service = Service("D:/anaconda/Scripts/chromedriver.exe")

# Initialize the WebDriver (pass the Service so the chromedriver path is actually used)
driver = webdriver.Chrome(service=service, options=options)
driver.implicitly_wait(10)
url = "https://www.zhihu.com/people/ta-mei-yu-zhi-jian-de-wen-rou/answers"
driver.get(url)
time.sleep(10)
page_source = driver.page_source

# Regex for the answer links
link_pattern = '<a target="_blank" data-za-detail-view-element_name="Title" href="(.*?)">'
# Regex for the question titles and author ids (they alternate in the page source)
name_pattern = '<meta itemprop="name" content="(.*?)">'
# Regex for the upvote counts
vote_pattern = '<span><button aria-label="(.*?)" aria-live="polite" type="button" class="Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte">'
# Regex for the answer timestamps
time_pattern = '<meta itemprop="dateModified" content="(.*?)">'
answer_links = re.findall(link_pattern, page_source)
names = re.findall(name_pattern, page_source)
vote_counts = re.findall(vote_pattern, page_source)
answer_times = re.findall(time_pattern, page_source)
question_names = names[0::2]
author_ids = names[1::2]
answer_ids = [re.search(r'/answer/(\d+)', link).group(1) for link in answer_links]
driver.quit()

# Print every answer id
for i in range(len(answer_ids)):
    print(f"ID[{i}] = {answer_ids[i]}")

# Strip the leading "//" so the links can be prefixed with "https://" later
Line = [link.lstrip('/') for link in answer_links]   # list of answer links
answercontent = []                                   # list of answer bodies


# Output file for the comments
result_file = 'zhihu_comments.csv'

# Empty DataFrame that will collect the comments from every answer
df_all = pd.DataFrame()
comment_number = []  # number of comments under each answer

# Loop over every answer id and fetch its comments through the Zhihu web API
for i in range(len(answer_ids)):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
    }

    # Id of the answer being processed
    answer_id = answer_ids[i]

    # Comments collected for this answer
    comments = []

    # Initial offset (only the first page of up to 20 comments is fetched here)
    offset = 0

    # Build the request URL from the answer id and the offset
    url = f'https://www.zhihu.com/api/v4/answers/{answer_id}/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvote_count%2Cis_parent_author%2Cis_author&order=normal&limit=20&offset={offset}&status=open'
    # Send the request and check for a successful response
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # Parse the response as JSON
        data = response.json()
        # Comment list and total comment count
        comments_list = data['data']
        comment_count = data['paging']['totals']
        # Check whether the comment list is empty
        if comments_list:
            # Walk through the comments and pull out the fields we need
            for comment in comments_list:
                # Comment id
                comment_id = comment['id']
                # Comment text
                content = comment['content']
                # Commenter's display name
                author_name = comment['author']['member']['name']
                # Commenter's gender: 0 = female, 1 = male, -1 = unknown
                author_gender = comment['author']['member']['gender']
                # Number of likes on the comment
                vote_count = comment['vote_count']
                # Creation time is a 10-digit Unix timestamp; convert it to a datetime
                created_time = datetime.utcfromtimestamp(comment['created_time'])
                # IP location shown next to the comment
                address_text = comment['address_text']
                # Commenter's user id
                author_id = comment['author']['member']['id']
                # Collect the fields as one row
                comments.append({
                    '回答id': answer_ids[i],
                    '评论id': comment_id,
                    '评论内容': content,
                    '昵称': author_name,
                    '性别': author_gender,
                    '点赞数': vote_count,
                    '评论时间': created_time,
                    'ip属地': address_text,
                    '用户id': author_id
                })

            print(f'已爬取{comment_count}条评论')
            comment_number.append(comment_count)
        else:
            # No comments under this answer
            comment_count = 0
            comment_number.append(comment_count)
            print(f'已爬取{comment_count}条评论')
    else:
        # Any non-200 status code means the request failed; record 0 comments so the
        # per-answer lists stay the same length
        comment_number.append(0)
        print('请求失败')

    print(f'第{i+1}个问题爬取完成')
    # Convert this answer's comments into a DataFrame
    df = pd.DataFrame(comments)
    # Append it to the combined DataFrame
    df_all = pd.concat([df_all, df], ignore_index=True)

    # After the last answer, fetch the full text of every answer with requests + BeautifulSoup
    if i == len(answer_ids) - 1:
        for j in range(len(answer_ids)):
            url1 = f'https://{Line[j]}'
            response1 = requests.get(url1, headers=headers)

            soup = BeautifulSoup(response1.text, 'html.parser')
            answer_content = [p.get_text() for p in soup.select('span.RichText.ztext.CopyrightRichText-richText.css-olurbu p')]
            answercontent.append(answer_content)

# Save the combined DataFrame to a single csv file
df_all.to_csv(result_file, index=False, encoding='utf-8-sig')

# Report where the comments were saved
print(f'评论内容已保存为{result_file}')

# Save one row per answer: id, link, question title, author, votes, time, comment count, answer body
data = list(zip(answer_ids, answer_links, question_names, author_ids, vote_counts, answer_times, comment_number, answercontent))
with open('answer.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['回答id', '回答网址', '问答问题名', '回答者昵称', '赞同数', '回答时间', '评论数量', '回答内容'])
    csv_writer.writerows(data)
print('博主回答内容已保存为answer.csv')
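
The loop above only requests the first page of comments for each answer (offset=0, limit=20). If an answer has more than 20 comments, the same endpoint can be paged by increasing the offset until an empty page comes back. Below is a minimal sketch of that idea, reusing the requests and time imports from above; the endpoint and paging behaviour are taken from the request URL in the code and may change on Zhihu's side.

def fetch_all_comments(answer_id, headers, limit=20):
    """Page through the comment API by bumping `offset` until a page comes back empty.
    Sketch only: the endpoint and field names come from the request above and may change."""
    all_comments = []
    offset = 0
    while True:
        url = (f'https://www.zhihu.com/api/v4/answers/{answer_id}/comments'
               f'?order=normal&limit={limit}&offset={offset}&status=open')
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            break
        page = response.json().get('data', [])
        if not page:
            break
        all_comments.extend(page)
        offset += limit
        time.sleep(1)  # be polite between requests
    return all_comments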

2. Scraping JD.com product listings

from DrissionPage import ChromiumOptions

# Tell DrissionPage where the Chrome executable lives; save() persists the setting
# to the default configuration file, so this only needs to run once.
path = r'C:/Program Files/Google/Chrome/Application/chrome.exe'
ChromiumOptions().set_browser_path(path).save()
from DrissionPage import ChromiumPage
from DrissionPage.common import Keys
import time
import csv
f=open('data2.csv',mode='w',encoding='utf-8',newline='')
csv_writer=csv.DictWriter(f,fieldnames=[
'标题',
'价格',
'链接',])
csv_writer.writeheader()

dp=ChromiumPage()
dp.get('https://www.jd.com/')
dp.ele('css:#key').input('郫县豆瓣酱')
dp.ele('css:#key').input(Keys.ENTER)


# Scrape up to 30 result pages
for page in range(1, 31):
    print('已经爬取到了第' + str(page) + '页\n')
    # Scroll to the bottom so lazy-loaded items appear
    dp.scroll.to_bottom()
    lis = dp.eles('css:.gl-item')
    for li in lis:
        time.sleep(2)
        try:
            # dp.scroll.down(50)
            title = li.ele('css:.p-name a em').text
            price = li.ele('css:.p-price i').text
            href = li.ele('css:.p-name a').attr('href')
            '''li.ele('css:.p-name a em').click()  # open the item in a new tab
            tabs = dp.get_tabs()
            tab = tabs[-1]
            img = tab.ele('css:#spec-img').attr('src')
            comment = tab.ele('css:.comment-con').text
            tab.close()'''
            row = {
                '标题': title,
                '价格': price,
                '链接': href,
            }
            csv_writer.writerow(row)
            print(row)
        except Exception:
            # Skip items that are missing any of the fields
            continue
    # Send a Right-arrow keystroke to jump to the next result page
    lis[0].input(Keys.RIGHT)

# Close the file so all buffered rows are flushed to data2.csv
f.close()
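
If the same product appears on more than one result page, data2.csv can end up with duplicate links. Before feeding it to the review scraper in Section 3, a quick pandas pass can drop them; this is just a small sketch using the 链接 column written above.

import pandas as pd

# Keep only one row per product link so each item is visited once in Section 3
df = pd.read_csv('data2.csv')
df = df.drop_duplicates(subset='链接')
df.to_csv('data2.csv', index=False, encoding='utf-8')
print(f'{len(df)} unique listings kept')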

3. Scraping JD.com product reviews (can follow Section 2)

from DrissionPage import ChromiumPage
import pandas as pd
import time
import csv

# Use `with open` so the file is closed automatically; everything that writes rows
# stays inside the block so the file is still open while scraping
with open('data4.csv', mode='w', encoding='utf-8', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['名称', '评论', '购买时间'])
    csv_writer.writeheader()

    dp = ChromiumPage()
    df = pd.read_csv('data2.csv')

    if '链接' in df.columns:
        for url in df['链接']:
            try:
                dp.get(url)
                time.sleep(2)
                # Optionally wait for an element that is always present on the page
                # dp.wait.ele('css:#detail', timeout=10)

                # Scroll to the bottom to make the review section more likely to load
                dp.scroll.to_bottom()

                # Make sure the reviews tab is available before clicking it
                tab_element = dp.ele('css:#detail > div.tab-main.large > ul > li.current', timeout=5)
                if tab_element:
                    tab_element.click()

                # Listen for the review API request; use a timeout so we never hang
                dp.listen.start('pc_club_productPageComments')
                r = dp.listen.wait(timeout=10)

                if r:
                    json_data = r.response.body
                    print(json_data)

                    comments = json_data.get('comments', [])
                    for item in comments:
                        row_data = {
                            '名称': item.get('nickname', 'N/A'),
                            '评论': item.get('content', 'N/A'),
                            '购买时间': item.get('referenceTime', 'N/A'),
                        }
                        csv_writer.writerow(row_data)

            except Exception as e:
                print(f"处理 {url} 时出错: {e}")
