Civilpy - 文章

使用KNN及tensorflow进行中文pdf关键词搜索，类似AutoGPT或ChatPDF实现原理

发布时间：2023-05-05 付费文章：9.9元

Base

基础知识

Civil

土木分类资料

Python

Python编程学习

Tools

自媒体效率工具

接上回，如何使用AI模型（如GPT、LLaMA），训练某一考试的教材、历年试题？

直接上代码，将该搜索功能与GPT结合使用：

PDF文本搜索

import os
import re
import shutil
import urllib.request
from pathlib import Path
from tempfile import NamedTemporaryFile

import fitz
import numpy as np
import openai
import tensorflow_hub as hub
from sklearn.neighbors import NearestNeighbors

# Per-page PDF text clean-up; pdf_to_text() applies this to every page it reads.
def preprocess(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines, tabs and repeated spaces are all whitespace for this purpose,
    so a page's text ends up on one line. Leading/trailing whitespace is
    collapsed to one space as well, not stripped.

    Args:
        text: Raw text extracted from one PDF page.

    Returns:
        The text with each whitespace run replaced by one ' '.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text)


def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the preprocessed text of each page in a page range of a PDF.

    Args:
        path: Location of the PDF, passed to ``fitz.open``.
        start_page: 1-based number of the first page to read.
        end_page: 1-based number of the last page to read (inclusive).
            ``None`` means read through the document's last page.

    Returns:
        A list with one preprocessed text string per page, in page order.
    """
    doc = fitz.open(path)
    try:
        if end_page is None:
            end_page = doc.page_count

        # Page indices are 0-based, hence start_page - 1.
        return [
            preprocess(doc.load_page(page_index).get_text("text"))
            for page_index in range(start_page - 1, end_page)
        ]
    finally:
        # Release the document even when a page fails to load or extract.
        doc.close()

def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into page-labelled chunks of space-separated tokens.

    Each page is split on single spaces and cut into pieces of at most
    ``word_length`` tokens. A short trailing piece on any page but the last
    is not emitted; it is prepended to the following page's tokens instead.

    Args:
        texts: One text string per page, in page order.
        word_length: Maximum number of tokens per chunk.
        start_page: Page number used to label the first entry of ``texts``.

    Returns:
        A list of strings shaped like ``[Page no. N] "chunk text"``.
    """
    pages = [page_text.split(' ') for page_text in texts]
    final_index = len(pages) - 1
    labelled = []
    carry = []  # short tail of the previous page, waiting to be merged

    for page_index, page_tokens in enumerate(pages):
        tokens = carry + page_tokens
        carry = []
        for offset in range(0, len(tokens), word_length):
            piece = tokens[offset:offset + word_length]
            # Only the last piece of a page can be short; hold it back for
            # the next page unless this is already the final page.
            if len(piece) < word_length and page_index != final_index:
                carry = piece
                break
            body = ' '.join(piece).strip()
            labelled.append(f'[Page no. {page_index + start_page}] "{body}"')

    return labelled

class SemanticSearch:
    """Nearest-neighbour search over text embeddings.

    Texts are embedded with a model loaded through ``tensorflow_hub`` and
    indexed with scikit-learn's ``NearestNeighbors``. Call ``fit`` once with
    the corpus, then call the instance with a query string.
    """

    def __init__(self, model_path="F:/*******"):
        """Load the embedding model.

        Args:
            model_path: Location handed to ``hub.load``. The default is the
                original hard-coded path (redacted in the source; marked
                there as the Chinese model).
        """
        self.use = hub.load(model_path)

        self.fitted = False

    def fit(self, data, batch=100, n_neighbors=3):
        """Embed *data* and build the nearest-neighbour index.

        Args:
            data: Sequence of text chunks to index.
            batch: Number of texts embedded per model call.
            n_neighbors: Number of results a query returns; capped at the
                number of indexed texts.

        Raises:
            ValueError: If *data* is empty.
        """
        # NOTE(review): the original carried "batch=1000, n_neighbors=5" as
        # an inline note — presumably alternative settings; confirm.
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the indexed entries closest to *text*.

        Args:
            text: Query string.
            return_data: When true, return the matching texts; otherwise
                return their indices into the fitted data.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if not self.fitted:
            # Without this guard the lookup below dies on a missing
            # attribute, which hides the real cause.
            raise RuntimeError(
                "SemanticSearch.fit() must be called before querying"
            )

        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in slices of *batch* and stack the results row-wise.

        Raises:
            ValueError: If *texts* is empty.
        """
        embeddings = [
            self.use(texts[i : (i + batch)])
            for i in range(0, len(texts), batch)
        ]
        if not embeddings:
            # np.vstack rejects an empty list with a less helpful message.
            raise ValueError("texts must contain at least one entry")
        return np.vstack(embeddings)


def load_recommender(path, start_page=1):
    """Fit the module-level ``recommender`` on the chunks of a PDF.

    Args:
        path: Location of the PDF to read.
        start_page: 1-based page to start from; also used to label chunks.

    Returns:
        The status string 'Corpus Loaded.'.
    """
    page_texts = pdf_to_text(path, start_page=start_page)
    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'

# Build the search corpus from the PDF, then run one sample query.
pdf_path='第3章  岩土工程勘察.pdf'
# Module-level instance: load_recommender() and generate_answer() both use it.
recommender = SemanticSearch()
load_recommender(pdf_path) # builds the corpus via fit()
question='钻孔深度相关规定？'
# Calling the instance returns the stored chunks nearest to the question.
topn_chunks = recommender(question)
print(topn_chunks)

GPT查询代码

def generate_answer(question, openAI_key):
    """Answer *question* with GPT, grounded in the best-matching PDF chunks.

    The module-level ``recommender`` supplies the chunks. They are placed in
    the prompt ahead of the instructions and the query, and the prompt is
    sent through ``generate_text`` with the "text-davinci-003" engine.

    Args:
        question: The user's query.
        openAI_key: API key forwarded to ``generate_text``.

    Returns:
        Whatever ``generate_text`` returns for the assembled prompt.
    """
    topn_chunks = recommender(question)

    # The query is appended exactly once, below. The instructions used to end
    # with a literal, never-interpolated "{question}" placeholder, which put
    # a bogus extra "Query: ... Answer:" pair into the prompt.
    instructions = (
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "
        "search results which has nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise. Answer step-by-step. \n\n"
    )

    prompt_parts = ['search results:\n\n']
    prompt_parts.extend(c + '\n\n' for c in topn_chunks)
    prompt_parts.append(instructions)
    prompt_parts.append(f"Query: {question}\nAnswer:")
    prompt = ''.join(prompt_parts)

    return generate_text(openAI_key, prompt, "text-davinci-003")

def generate_text(openAI_key, prompt, engine="text-davinci-003"):
    """Request one completion for *prompt* and return its text.

    Sets ``openai.api_key`` to *openAI_key* as a side effect.

    Args:
        openAI_key: API key to use for the request.
        prompt: Full prompt text to complete.
        engine: Name of the engine passed to ``openai.Completion.create``.

    Returns:
        The ``text`` of the first choice in the response.
    """
    openai.api_key = openAI_key
    request_options = dict(
        engine=engine,
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    response = openai.Completion.create(**request_options)
    return response.choices[0].text

# Read the API key from the environment instead of embedding a secret in the
# source, where it would be published along with the code.
openAI_key = os.environ.get('OPENAI_API_KEY')
if not openAI_key:
    raise RuntimeError(
        'Set the OPENAI_API_KEY environment variable before running.'
    )
generate_answer(question, openAI_key)

结语

以上类似于AutoGPT或ChatPDF的实现原理，感兴趣的读者可以试试。

付费后可以获得中文分词模型450M及4个Openai-Key【共享】

如忘记保存，或后续需要再次查看，可凭“订单号”手动获取。

打赏9.9元

手机端：用系统浏览器访问本链接，打开支付宝完成打赏

说明

电脑端，使用手机支付宝直接扫码，完成打赏，点击“直接获取”按钮获取资料。“手动获取”时，注意订单号的格式（参见右侧“订单号示例”）。