from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS, Chroma, Pinecone
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
import gradio as gr
from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage
import os
import pinecone

# Read the API key from the environment; never hard-code it.
# os.environ["OPENAI_API_KEY"] = 'sk-U'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Initialize Pinecone
pinecone.init(
    api_key=os.getenv('pinecone_api_key'),
    environment="gcp-starter"
)
index_name = "pdf-index"

# Extract the raw text from each PDF, page by page.
pdf_files = ['./ANSYS_Fluent_Text_Command_List.pdf']
raw_text = ''
for pdf_file in pdf_files:
    reader = PdfReader(pdf_file)
    for page in reader.pages:
        text = page.extract_text()
        if text:
            raw_text += text

# Split the text into overlapping chunks for embedding.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

# Dump the extracted text for inspection.
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(raw_text)

texts = text_splitter.split_text(raw_text)

embeddings = OpenAIEmbeddings()

# Persist the data (run once; split_text returns plain strings, so pass them directly):
#docsearch = Pinecone.from_texts(texts, embeddings, index_name=index_name)
# Load the data from the existing index:
docsearch = Pinecone.from_existing_index(index_name, embeddings)

# Local alternative to Pinecone:
#embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
#docsearch = FAISS.from_texts(texts, embeddings)

chain = load_qa_chain(OpenAI(), chain_type="stuff")
#llm = ChatOpenAI(temperature=1.0, model='gpt-3.5-turbo-0613')


def predict(message, history):
    # Rebuild the chat history in LangChain message format.
    # (Currently unused: the QA chain below only sees the latest question.)
    history_langchain_format = []
    for human, ai in history:
        history_langchain_format.append(HumanMessage(content=human))
        history_langchain_format.append(AIMessage(content=ai))
    history_langchain_format.append(HumanMessage(content=message))

    # Retrieve the most relevant chunks and answer against them.
    docs = docsearch.similarity_search(message)
    response = chain.run(input_documents=docs, question=message)

    # chain.run returns a plain string, not an OpenAI streaming iterator,
    # so stream it character by character for a typing effect in the UI.
    partial_message = ""
    for chunk in response:
        partial_message += chunk
        yield partial_message


gr.ChatInterface(predict).queue().launch()
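
# Note: `history_langchain_format` above is built but never consumed; the
# "stuff" QA chain answers only from the retrieved chunks. A minimal sketch
# (an assumption, not part of the original flow; `predict_with_history` is a
# hypothetical helper) of feeding that history to the commented-out ChatOpenAI
# model instead, kept commented out so runtime behavior is unchanged:
#
# llm = ChatOpenAI(temperature=1.0, model='gpt-3.5-turbo-0613')
#
# def predict_with_history(message, history):
#     messages = []
#     for human, ai in history:
#         messages.append(HumanMessage(content=human))
#         messages.append(AIMessage(content=ai))
#     messages.append(HumanMessage(content=message))
#     # Calling the chat model with a message list returns an AIMessage.
#     return llm(messages).content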