import csv
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
# Configure Chrome: set the UI language, hide the automation banner,
# and override the browser user-agent string.
options = Options()
options.add_argument('lang=zh_CN.UTF-8')
options.add_experimental_option("excludeSwitches", ['enable-automation'])
options.add_argument(
    'user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
# Return control as soon as navigation starts instead of waiting for the full page load.
options.page_load_strategy = "none"

# Point Selenium at the local chromedriver and open the answer list of the target profile.
service = Service("D:/anaconda/Scripts/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
driver.implicitly_wait(10)
url = "https://www.zhihu.com/people/ta-mei-yu-zhi-jian-de-wen-rou/answers"
driver.get(url)
time.sleep(10)  # give the page time to render before grabbing its HTML
page_source = driver.page_source
a = '<a target="_blank" data-za-detail-view-element_name="Title" href="(.*?)">'
a2='<meta itemprop="name" content="(.*?)">'
a3='<span><button aria-label="(.*?)" aria-live="polite" type="button" class="Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte">'
a5='<meta itemprop="dateModified" content="(.*?)">' a = re.findall(a, page_sourse) a2 = re.findall(a2, page_sourse) a3=re.findall(a3,page_sourse) a5=re.findall(a5,page_sourse) question_names=a2[0::2] author_ids=a2[1::2] time=a5 answer_ids = [re.search(r'/answer/(\d+)', url).group(1) for url in a] driver.quit()
# Show the extracted answer IDs.
for i in range(len(answer_ids)):
    print(f"ID[{i}] = {answer_ids[i]}")
# The hrefs begin with "//", so strip the leading slashes to get bare host/path strings.
url = [(index, line.lstrip('/')) for index, line in enumerate(a, 1)]
Line = []
answercontent = []
for index, line in url:
    Line.append(line)
# Output file for the scraped comments.
result_file = 'zhihu_comments.csv'
df_all = pd.DataFrame()
comment_number = []  # number of comments per answer, filled in below
for i in range(len(answer_ids)):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
    }
    answer_id = answer_ids[i]
    comments = []
    offset = 0
    # Comment API of this answer; only the first page (limit=20, offset=0) is requested here.
    url = f'https://www.zhihu.com/api/v4/answers/{answer_id}/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvote_count%2Cis_parent_author%2Cis_author&order=normal&limit=20&offset={offset}&status=open'
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        comments_list = data['data']
        comment_count = data['paging']['totals']
        if comments_list:
            for comment in comments_list:
                comment_id = comment['id']
                content = comment['content']
                author_name = comment['author']['member']['name']
                author_gender = comment['author']['member']['gender']
                vote_count = comment['vote_count']
                created_time = datetime.utcfromtimestamp(comment['created_time'])
                address_text = comment['address_text']
                author_id = comment['author']['member']['id']
                comments.append({
                    '回答id': answer_ids[i],
                    '评论id': comment_id,
                    '评论内容': content,
                    '昵称': author_name,
                    '性别': author_gender,
                    '点赞数': vote_count,
                    '评论时间': created_time,
                    'ip属地': address_text,
                    '用户id': author_id,
                })
            print(f'已爬取{comment_count}条评论')
            comment_number.append(comment_count)
        else:
            comment_count = 0
            comment_number.append(comment_count)
            print(f'已爬取{comment_count}条评论')
    else:
        print('请求失败')
    print(f'第{i+1}个问题爬取完成')
    df = pd.DataFrame(comments)
    df_all = pd.concat([df_all, df], ignore_index=True)

    # After the last answer's comments, fetch the full text of every answer page.
    if i == len(answer_ids) - 1:
        for j in range(len(answer_ids)):
            url1 = f'https://{Line[j]}'
            response1 = requests.get(url1, headers=headers)
            soup = BeautifulSoup(response1.text, 'html.parser')
            answer_content = [p.get_text() for p in soup.select('span.RichText.ztext.CopyrightRichText-richText.css-olurbu p')]
            answercontent.append(answer_content)
# Save all comments, then write one summary row per answer.
df_all.to_csv(result_file, index=False, encoding='utf-8-sig')
print(f'评论内容已保存为{result_file}')

data = list(zip(answer_ids, a, question_names, author_ids, a3, answer_times, comment_number, answercontent))
with open('answer.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Columns: answer id, answer URL, question title, author nickname,
    # upvote count, answer time, comment count, answer text.
    csv_writer.writerow(['回答id', '回答网址', '问答问题名', '回答者昵称', '赞同数', '回答时间', '评论数量', '回答内容'])
    csv_writer.writerows(data)
print('博主回答内容已保存为answer.csv')
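
# Optional sketch, not called above: the comment loop leaves offset at 0, so answers
# with more than 20 comments are truncated. This helper is an assumption-labelled
# sketch of pagination; it relies only on fields the script already uses
# (data['data'] plus limit/offset in the URL) and assumes an empty page means the
# end has been reached. The name fetch_all_comments is hypothetical.
def fetch_all_comments(answer_id, headers, limit=20):
    all_comments = []
    offset = 0
    while True:
        page_url = (f'https://www.zhihu.com/api/v4/answers/{answer_id}/comments'
                    f'?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked'
                    f'%2Ccontent%2Cvote_count%2Cis_parent_author%2Cis_author'
                    f'&order=normal&limit={limit}&offset={offset}&status=open')
        resp = requests.get(page_url, headers=headers)
        if resp.status_code != 200:
            break  # request failed; return whatever was collected so far
        page = resp.json().get('data', [])
        if not page:
            break  # empty page: nothing left to fetch
        all_comments.extend(page)
        offset += limit
        time.sleep(1)  # small delay between pages
    return all_comments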
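
# Optional sketch, not wired in: the answer-text selector above depends on the hashed
# class "css-olurbu", which Zhihu's build pipeline may change at any time. A hedged
# alternative, assuming the answer body keeps the stable "RichText ztext" classes,
# is to match only those. extract_answer_text is a hypothetical helper name.
def extract_answer_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.select_one('span.RichText.ztext')
    if body is None:
        return []  # layout changed or the answer body did not render
    return [p.get_text() for p in body.find_all('p')]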