2023-5-5 Python requests实现知乎问题评论采集

发布时间:2023-05-06 付费文章:9.9元

如题,很早之前写过一个爬虫,现在已经失效了,所以重新整理了一下。

具体功能实现:单个问题或多个问题评论采集+保存为csv或存到mongoDB,可选。

目的:将文本二次创作为视频。

代码:【打赏后获取完整代码压缩包】

 

import json
from json import JSONDecodeError
import pandas as pd
import pymongo
import requests
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from tqdm import tqdm
from DataTools import *


class Get_answers_of_question_from_API:
    """Collect the answers of Zhihu questions through a paged JSON endpoint.

    Every page is loaded with a Selenium-driven Chrome instance and the JSON
    text is read out of the rendered page body. Each page that contains data
    is handed to ``get_item_answer_json_data`` (defined outside this excerpt),
    and the results go either to MongoDB or to a ``CsvClient`` from
    ``DataTools``.
    """

    def __init__(self, proxies=None, MongoDB=True):
        """Initialise state, read the API URL template and start Chrome.

        :param proxies: stored on ``self.proxies``; not used by the methods
            visible in this excerpt.
        :param MongoDB: when truthy results are stored in MongoDB, otherwise
            a ``CsvClient`` is used.
        """
        # para init
        self.MongoDB = MongoDB
        self.myclient = None  # shared pymongo client, created on first use
        self.mycol = None
        self.collectionName = None
        self.page = None
        self.baseUrl = None
        self.proxies = proxies
        self.data = None
        self.data_json = None
        self.questions_id = None
        self.Url = None
        # base url set
        self.set_base_url('baseUrl.txt')
        # drive init
        options = webdriver.ChromeOptions()
        # To attach to an already running Chrome instead of starting a new
        # one, enable:
        # options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        self.driver = webdriver.Chrome(options=options)

    def MongoClient(self):
        """Point ``self.mycol`` at collection ``self.collectionName`` of DB "ZhiHu".

        The underlying client is created once and reused, so calling this
        for every question does not open a new connection pool each time.
        """
        # Connect to the database (only on the first call).
        if getattr(self, 'myclient', None) is None:
            self.myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        mydb = self.myclient["ZhiHu"]  # database name
        self.mycol = mydb[self.collectionName]  # collection (table)

    def set_base_url(self, baseUrlFileName):
        """
        Load the API URL template into ``self.baseUrl``.

        The template is later filled with a question id via ``str.format``.

        :param baseUrlFileName: path of a UTF-8 text file holding the
            template, e.g. ``baseUrl.txt``
        :return: None
        """
        with open(baseUrlFileName, 'rt', encoding='utf-8') as f:
            self.baseUrl = f.read().strip()

    @staticmethod
    def _parse_json_text(text):
        """Return the decoded JSON value of ``text``, or None if it is not JSON."""
        try:
            # strict=False tolerates raw control characters inside strings.
            return json.loads(text, strict=False)
        except JSONDecodeError:
            return None

    @staticmethod
    def _has_data(payload):
        """Return True when ``payload`` is a dict that carries a 'data' key."""
        return isinstance(payload, dict) and 'data' in payload

    def get_json_data_from_Url(self):
        """
        Load ``self.Url`` and store the decoded payload in ``self.data_json``.

        When the page does not yield a JSON object with a ``data`` key (for
        example because a verification page was served), the user is asked
        to finish the verification in the browser and the same URL is loaded
        again. A non-empty ``data`` list is passed on to
        ``get_item_answer_json_data``.

        :return: None
        """
        while True:
            # Forget the previous page so a failed parse can never cause
            # stale data to be processed a second time.
            self.data_json = None
            self.driver.get(self.Url)
            html = self.driver.page_source
            body = BeautifulSoup(html, 'html.parser').body
            text = body.text if body is not None else ''
            # NOTE(review): the GBK round-trip silently drops every character
            # GBK cannot represent; kept as in the original.
            content = text.encode('gbk', 'ignore').decode('gbk')
            self.data_json = self._parse_json_text(content)
            if self._has_data(self.data_json):
                break
            print("请完成验证后键入 y")
            input()
        if len(self.data_json['data']) > 0:
            self.get_item_answer_json_data()

    def main_get(self, questions_list):
        """Collect all answer pages for every question id in ``questions_list``.

        :param questions_list: iterable of question ids given as strings
            (each id is concatenated with ``'_API'`` and formatted into URLs)
        :return: None
        """
        for self.questions_id in questions_list:
            # collection name setting
            # NOTE(review): the name is only derived while it is still unset,
            # so with several questions every one is written to the
            # collection named after the first id - confirm this is intended.
            if not self.collectionName:
                self.collectionName = self.questions_id + '_API'
            # mongo or csv init
            if self.MongoDB:
                self.MongoClient()
            else:
                self.mycol = CsvClient(subFolder='outData', FileName=self.collectionName)
            # get total answer num
            self.driver.get('https://www.zhihu.com/question/{}'.format(self.questions_id))
            total_answer = self.driver.find_element(By.CLASS_NAME, "List-headerText").text.split(' ')[0]
            # The counter may contain thousands separators, e.g. "1,234".
            total_answer = int(total_answer.replace(",", ""))
            with tqdm(total=total_answer) as self.pbar:
                # get base url of question: pages 1
                self.Url = self.baseUrl.format(self.questions_id)
                self.pbar.set_description("question:{}-page:{}".format(self.questions_id, 1))
                # rolling get data
                while True:
                    self.get_json_data_from_Url()
                    paging = self.data_json['paging']
                    if paging['is_end']:
                        break
                    # update Url and page number
                    self.Url = paging['next']
                    page = paging['page']
                    self.pbar.set_description("question:{}-page:{}".format(self.questions_id, page))

注1:支付宝扫下图绿码打赏后,再点击 直接获取↑

注2:如忘记保存或后续查看,可凭订单号 手动获取