import datetime
import json

import pandas as pd
import requests
from selectolax.parser import HTMLParser

# Zhihu question to crawl; `question_id` avoids shadowing the built-in id().
question_id = 384102529

url = f'https://www.zhihu.com/api/v4/questions/{question_id}/answers'
headers = {
    'Host': 'www.zhihu.com',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'referer': f'https://www.zhihu.com/question/{question_id}',
}
df = pd.DataFrame(columns=('author', 'fans_count', 'content', 'created_time',
                           'updated_time', 'comment_count', 'voteup_count', 'url'))


def crawler(start):
    """Fetch one page of 20 answers starting at offset `start`, then recurse to the next page."""
    print(start)
    global df
    data = {
        'include': 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,is_labeled,paid_info,paid_info_content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,is_recognized;data[*].mark_infos[*].url;data[*].author.follower_count,vip_info,badge[*].topics;data[*].settings.table_of_content.enabled',
        'offset': start,
        'limit': 20,
        'sort_by': 'default',
        'platform': 'desktop',
    }

    r = requests.get(url, params=data, headers=headers)
    res = json.loads(r.text)
    if res['data']:
        for answer in res['data']:
            author = answer['author']['name']
            fans = answer['author']['follower_count']
            # Strip HTML tags from the answer body.
            content = HTMLParser(answer['content']).text()
            created_time = datetime.datetime.fromtimestamp(answer['created_time'])
            updated_time = datetime.datetime.fromtimestamp(answer['updated_time'])
            comment = answer['comment_count']
            voteup = answer['voteup_count']
            link = answer['url']

            row = {
                'author': [author],
                'fans_count': [fans],
                'content': [content],
                'created_time': [created_time],
                'updated_time': [updated_time],
                'comment_count': [comment],
                'voteup_count': [voteup],
                'url': [link],
            }
            # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
            df = pd.concat([df, pd.DataFrame(row)], ignore_index=True)

    # A full page of 20 suggests more answers remain; fetch the next page.
    if len(res['data']) == 20:
        crawler(start + 20)
    else:
        print(res)


crawler(0)
df.to_excel(
    f'result_{question_id}_{datetime.datetime.now().strftime("%Y-%m-%d")}.xlsx',
    index=False,
)
print("done~")
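
One design note: crawler() recurses once per page, so a question with tens of thousands of answers would hit Python's default recursion limit (about 1000 frames, i.e. roughly offset 20000), and growing a global DataFrame with pd.concat re-copies it on every page. Below is a minimal iterative sketch of the same offset/limit pagination; it is an assumption-laden illustration, not the author's code. crawl_all is a hypothetical name, it reuses the url and headers defined above, and it drops the long include parameter for brevity, so only default answer fields (author name, vote count, URL) are assumed to be available in the response:

def crawl_all():
    """Hypothetical iterative variant of crawler(): loops over offsets instead of recursing."""
    rows = []
    start = 0
    while True:
        params = {'offset': start, 'limit': 20, 'sort_by': 'default', 'platform': 'desktop'}
        res = requests.get(url, params=params, headers=headers).json()
        page = res.get('data', [])
        # Collect plain dicts; the DataFrame is built once at the end.
        rows.extend(
            {'author': a['author']['name'], 'voteup_count': a['voteup_count'], 'url': a['url']}
            for a in page
        )
        if len(page) < 20:  # a short (or empty) page means we reached the last answer
            return pd.DataFrame(rows)
        start += 20

Accumulating rows in a list and constructing the DataFrame once at the end avoids the quadratic copying of repeated concatenation, and the while loop removes the recursion-depth ceiling entirely.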