How to Fetch All Answers Under a Zhihu Question

I just helped a classmate fetch all the answers under a Zhihu question, so I'm backing up the code here.

The code is adapted from here, with minor changes: it pages through Zhihu's v4 answers API 20 answers at a time and saves the results to an Excel file.

# -*- coding: utf-8 -*-

import datetime
import json

import pandas as pd
import requests
from selectolax.parser import HTMLParser

# Zhihu question id (the number in the question URL)
question_id = 384102529

url = 'https://www.zhihu.com/api/v4/questions/' + str(question_id) + '/answers'
headers = {
    'Host': 'www.zhihu.com',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'referer': 'https://www.zhihu.com/question/' + str(question_id)
}
df = pd.DataFrame(columns=('author', 'fans_count', 'content', 'created_time', 'updated_time',
                           'comment_count', 'voteup_count', 'url'))


def crawler(start):
    print(start)
    global df
    data = {
        # Fields to include in each answer object of the response
        'include': 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,is_labeled,paid_info,paid_info_content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,is_recognized;data[*].mark_infos[*].url;data[*].author.follower_count,vip_info,badge[*].topics;data[*].settings.table_of_content.enabled',
        'offset': start,
        'limit': 20,
        'sort_by': 'default',
        'platform': 'desktop'
    }

    # Pass the query parameters via params
    r = requests.get(url, params=data, headers=headers)
    res = json.loads(r.text)
    answers = res.get('data', [])

    for answer in answers:
        author = answer['author']['name']
        fans = answer['author']['follower_count']
        # The answer body is HTML; strip the tags and keep the plain text
        content = HTMLParser(answer['content']).text()
        created_time = datetime.datetime.fromtimestamp(answer['created_time'])
        updated_time = datetime.datetime.fromtimestamp(answer['updated_time'])
        comment = answer['comment_count']
        voteup = answer['voteup_count']
        link = answer['url']

        row = {
            'author': [author],
            'fans_count': [fans],
            'content': [content],
            'created_time': [created_time],
            'updated_time': [updated_time],
            'comment_count': [comment],
            'voteup_count': [voteup],
            'url': [link]
        }
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        df = pd.concat([df, pd.DataFrame(row)], ignore_index=True)

    # A full page (20 answers) means there may be another page; otherwise stop
    if len(answers) == 20:
        crawler(start + 20)
    else:
        print(res)  # print the last response for a quick sanity check


crawler(0)
df.to_excel(
    f'result_{question_id}_{datetime.datetime.now().strftime("%Y-%m-%d")}.xlsx',
    index=False
)
print("done~")

Fin.
