As the title says: a crawler I wrote quite a while ago stopped working, so I cleaned it up and rewrote it.
What it does: collects the comments/answers under a single question or a batch of questions, and saves them either to CSV or to MongoDB, whichever you prefer.
Purpose: repurposing the collected text into derivative videos.
Code: [tip to receive the full code archive (zip)]
import json
from json import JSONDecodeError

import pandas as pd
import pymongo
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

from DataTools import *  # provides CsvClient and other helpers (shipped in the full archive)


class Get_answers_of_question_from_API:
    def __init__(self, proxies=None, MongoDB=True):
        # parameter init
        self.MongoDB = MongoDB
        self.mycol = None
        self.collectionName = None
        self.page = None
        self.baseUrl = None
        self.proxies = proxies
        self.data = None
        self.data_json = None
        self.questions_id = None
        self.Url = None
        # load the API base URL template
        self.set_base_url('baseUrl.txt')
        # driver init
        options = webdriver.ChromeOptions()
        # options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        self.driver = webdriver.Chrome(options=options)

    def MongoClient(self):
        # connect to the local MongoDB instance
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        mydb = myclient["ZhiHu"]                # database name
        self.mycol = mydb[self.collectionName]  # collection (table)

    def set_base_url(self, baseUrlFileName):
        """
        Load the API base URL template; given a question_id, the API can be
        used directly to fetch the answer data.
        :param baseUrlFileName: baseUrl.txt
        :return: None (sets self.baseUrl)
        """
        with open(baseUrlFileName, 'rt', encoding='utf-8') as f:
            self.baseUrl = f.read().strip()

    def get_json_data_from_Url(self):
        """
        Fetch self.Url with the browser, parse the JSON payload and hand it on.
        """
        # response = requests.get(self.Url)
        # self.data_json = json.loads(response.content)
        self.driver.get(self.Url)
        html = self.driver.page_source
        content = BeautifulSoup(html, 'html.parser').body.text.encode('gbk', 'ignore').decode('gbk')
        try:
            self.data_json = json.loads(content, strict=False)
        except JSONDecodeError:
            pass
        try:
            if len(self.data_json['data']) > 0:
                # defined in the full archive; presumably stores each answer and advances self.pbar
                self.get_item_answer_json_data()
        except (KeyError, TypeError):
            # no 'data' field (or no JSON at all): most likely an anti-bot verification page
            print("Please complete the verification in the browser, then type y")
            input()
            self.get_json_data_from_Url()

    def main_get(self, questions_list):
        for self.questions_id in questions_list:
            # collection name setting: "<question_id>_API"; note it is only set once,
            # so a multi-question run keeps the first name unless reset
            if not self.collectionName:
                self.collectionName = self.questions_id + '_API'
            # MongoDB or CSV init
            if self.MongoDB:
                self.MongoClient()
            else:
                self.mycol = CsvClient(subFolder='outData', FileName=self.collectionName)
            # get the total number of answers from the question page
            self.driver.get('https://www.zhihu.com/question/{}'.format(self.questions_id))
            total_answer = self.driver.find_element(By.CLASS_NAME, "List-headerText").text.split(' ')[0]
            total_answer = int(total_answer.replace(",", ""))
            with tqdm(total=total_answer) as self.pbar:
                # build the URL of page 1 from the base URL template
                self.Url = self.baseUrl.format(self.questions_id)
                self.pbar.set_description("question:{}-page:{}".format(self.questions_id, 1))
                # page through the API until is_end
                while True:
                    self.get_json_data_from_Url()
                    if not self.data_json['paging']['is_end']:
                        # update Url and page number
                        self.Url = self.data_json['paging']['next']
                        page = self.data_json['paging']['page']
                        self.pbar.set_description("question:{}-page:{}".format(self.questions_id, page))
                    else:
                        break
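The snippet above stops at the paging loop; get_item_answer_json_data() and the DataTools helpers (e.g. CsvClient) are only in the full archive. As a rough usage sketch, assuming baseUrl.txt sits next to the script and holds the paginated answers-API URL template with a single {} placeholder for the question ID (plus whatever query parameters the full archive documents), and assuming Chrome and chromedriver are installed, the class would be driven roughly like this (the question IDs below are made-up placeholders):

if __name__ == '__main__':
    # hypothetical question IDs; replace with the real ones you want to scrape
    question_ids = ['123456789', '987654321']

    # MongoDB=True: every answer goes into the local "ZhiHu" database,
    # one collection per question, named "<question_id>_API"
    spider = Get_answers_of_question_from_API(MongoDB=True)
    spider.main_get(question_ids)

    # MongoDB=False: rows are written through DataTools.CsvClient instead,
    # as CSV files under the "outData" sub-folder
    # spider = Get_answers_of_question_from_API(MongoDB=False)
    # spider.main_get(question_ids)

The Chrome window is opened visibly on purpose: when Zhihu throws a verification page, you solve it in that window and answer the prompt in the console so the loop can resume.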
If you forget to save it, or want to view it again later, you can retrieve it manually with your order number.