Civilpy - 文章

2023-5-5 Python requests实现知乎问题评论采集

发布时间：2023-05-06 付费文章：9.9元

Base

基础知识

Civil

土木分类资料

Python

Python编程学习

Tools

自媒体效率工具

如题，很早之前写过的一个爬虫已经失效了，所以重新整理了一下。

具体功能实现：单个问题或多个问题评论采集，结果可选保存为 CSV 文件或存入 MongoDB。

目的：文本二创为视频。

代码：【打赏后获取完整代码压缩包】

import json
from json import JSONDecodeError
import pandas as pd
import pymongo
import requests
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from tqdm import tqdm
from DataTools import *


class Get_answers_of_question_from_API:
    def __init__(self, proxies=None, MongoDB=True):
        """
        Set up crawler state, load the API url template and start Chrome.

        :param proxies: kept on the instance as ``self.proxies``
        :param MongoDB: storage switch read by ``main_get``; truthy selects
            MongoDB, falsy selects ``CsvClient``
        """
        # settings supplied by the caller
        self.proxies = proxies
        self.MongoDB = MongoDB
        # crawl state, populated later by the other methods
        self.baseUrl = None
        self.Url = None
        self.questions_id = None
        self.page = None
        self.data = None
        self.data_json = None
        # storage handle and its name
        self.collectionName = None
        self.mycol = None
        # read the url template before a browser window is opened
        self.set_base_url('baseUrl.txt')
        # launch the browser; to attach to an already running Chrome instead,
        # set the experimental option "debuggerAddress" to "127.0.0.1:9222"
        chrome_options = webdriver.ChromeOptions()
        self.driver = webdriver.Chrome(options=chrome_options)

    def MongoClient(self):
        """
        Connect to the local MongoDB server and point ``self.mycol`` at the
        collection named ``self.collectionName`` inside database "ZhiHu".

        :return: None
        """
        connection = pymongo.MongoClient("mongodb://localhost:27017/")
        # database "ZhiHu" -> collection (table) chosen by collectionName
        self.mycol = connection["ZhiHu"][self.collectionName]

    def set_base_url(self, baseUrlFileName):
        """
        API base url, if u know question_id, can use API to get data
        :param baseUrlFileName: baseUrl.txt
        :return: 请求头header,dict
        """
        with open(baseUrlFileName, 'rt', encoding='utf-8') as f:
            self.baseUrl = f.read().strip()

    def get_json_data_from_Url(self):
        """
        Open ``self.Url`` in the browser, parse the page body as JSON into
        ``self.data_json`` and pass non-empty pages on to
        ``get_item_answer_json_data``.

        When the body is not valid JSON, or the parsed object has no
        ``'data'`` key, the user is asked to complete the verification in the
        browser and press enter; the same url is then requested again.
        Exceptions raised by ``get_item_answer_json_data`` are no longer
        mistaken for a verification page and propagate to the caller.

        :return: None
        """
        while True:
            # Forget the previous page so that a failed parse can never be
            # confused with a fresh response.
            self.data_json = None
            self.driver.get(self.Url)
            html = self.driver.page_source
            body = BeautifulSoup(html, 'html.parser').body
            if body is None:
                content = ''
            else:
                # characters that GBK cannot encode are dropped
                content = body.text.encode('gbk', 'ignore').decode('gbk')
            try:
                self.data_json = json.loads(content, strict=False)
            except JSONDecodeError:
                # not JSON at all: handled below like a missing 'data' key
                self.data_json = None
            if isinstance(self.data_json, dict) and 'data' in self.data_json:
                break
            print("请完成验证后键入 y")
            input()
        if len(self.data_json['data']) > 0:
            self.get_item_answer_json_data()

    def main_get(self, questions_list):
        """
        Crawl all answer pages of every question in ``questions_list``.

        For each question the storage target is prepared (a MongoDB
        collection when ``self.MongoDB`` is truthy, otherwise a ``CsvClient``
        in sub folder "outData"), the total answer count is read from the
        question page to size the progress bar, and the API pages are fetched
        one after another until ``paging.is_end`` is reported.

        Collection naming: if ``self.collectionName`` was set by the caller,
        that name is used for every question. Otherwise each question gets
        its own name ``<question_id>_API``.

        :param questions_list: iterable of question ids (str or int)
        :return: None
        """
        # A name this method derived earlier is not a caller's choice; only a
        # name that differs from it is treated as explicitly configured.
        derived_name = getattr(self, '_derived_collection_name', None)
        if self.collectionName and self.collectionName != derived_name:
            fixed_name = self.collectionName
        else:
            fixed_name = None
        for self.questions_id in questions_list:
            # collection name setting, refreshed for every question
            if fixed_name:
                self.collectionName = fixed_name
            else:
                self.collectionName = '{}_API'.format(self.questions_id)
                self._derived_collection_name = self.collectionName
            # mongo or csv init
            if self.MongoDB:
                self.MongoClient()
            else:
                self.mycol = CsvClient(subFolder='outData', FileName=self.collectionName)
            # get total answer num
            self.driver.get('https://www.zhihu.com/question/{}'.format(self.questions_id))
            # presumably the header text starts with the count followed by a
            # space, e.g. "1,234 ..." - verify against the live page
            total_answer = self.driver.find_element(By.CLASS_NAME, "List-headerText").text.split(' ')[0]
            total_answer = int(total_answer.replace(",", ""))
            with tqdm(total=total_answer) as self.pbar:
                # first page: fill the url template with the question id
                self.Url = self.baseUrl.format(self.questions_id)
                self.pbar.set_description("question:{}-page:{}".format(self.questions_id, 1))
                # keep following paging.next until the API reports the end
                while True:
                    self.get_json_data_from_Url()
                    paging = self.data_json['paging']
                    if paging['is_end']:
                        break
                    # update Url and page number
                    self.Url = paging['next']
                    page = paging['page']
                    self.pbar.set_description("question:{}-page:{}".format(self.questions_id, page))

如忘记保存，或后续需要再次查看，可凭“订单号”手动获取。

打赏9.9元

手机端：用系统浏览器访问本链接，打开支付宝完成打赏

说明

电脑端，使用手机支付宝直接扫码，完成打赏，点击“直接获取”按钮获取资料。“手动获取”时，注意订单号的格式（参见右侧“订单号示例”）。