import os

import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Set your own OpenAI API key here, or export it in the shell before launching.
os.environ["OPENAI_API_KEY"] = "sk-..."

llm = ChatOpenAI(model="gpt-4o-mini")

system_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise. \
If the question is in Chinese, answer in Chinese. 如果问题以中文提问,那么就用流畅自然的中文来回答。
Question: {question}
Context: {context}
Answer:"""),
    ("user", "{question}, {context}")
])

# vectorstore = Chroma(
#     collection_name="example_collection",
#     embedding_function=OpenAIEmbeddings(),
#     # persist_directory="./chroma_langchain_db",  # where to save data locally; remove if not necessary
# )


def read_url(url):
    """Fetch a web page, keep only its <p> text, and split it into chunks."""
    response = requests.get(url)
    html_content = response.text
    paragraphs = BeautifulSoup(html_content, 'html.parser').find_all('p')
    full_content = "".join(p.get_text() for p in paragraphs)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
    splits = text_splitter.create_documents([full_content])
    return splits


def read_file(file):
    """Load an uploaded PDF/TXT/MD file and split it into chunks."""
    if file.name.endswith('.pdf'):
        loader = PyPDFLoader(file.name)
        pages = loader.load_and_split()
    elif file.name.endswith('.txt') or file.name.endswith('.md'):
        loader = TextLoader(file.name)
        pages_no_split = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, add_start_index=True)
        pages = text_splitter.split_documents(pages_no_split)
    else:
        return None
    return pages


def output_format_docs(docs):
    """Format the retrieved chunks for display in the UI."""
    formatted_docs = [
        f"\n ========== 第{i + 1}个知识片段 ========== \n{doc.page_content}"
        for i, doc in enumerate(docs)
    ]
    return "\n".join(formatted_docs)


def format_docs(docs):
    """Concatenate the retrieved chunks into a single context string for the LLM."""
    return "\n\n".join(doc.page_content for doc in docs)


# ==================== GRADIO START ====================
def greet(prompt, file, url):
    if prompt == "":
        return '你还没有输入问题哦', ''
    elif url == '':
        all_splits = read_file(file) if file is not None else None
    else:
        all_splits = read_url(url)
    if not all_splits:
        return '没有读取到可用的知识内容,请上传PDF/TXT/MD文件或填写URL', ''

    # Build a temporary vector store for this request, retrieve the most relevant
    # chunks, then let the LLM answer based on them.
    vectorstore = Chroma(
        collection_name="example_collection",
        embedding_function=OpenAIEmbeddings(),
        # persist_directory="./chroma_langchain_db",  # where to save data locally; remove if not necessary
    )
    vectorstore.add_documents(documents=all_splits)
    retriever = vectorstore.as_retriever()
    retrieved_docs = retriever.invoke(prompt)
    formatted_doc = format_docs(retrieved_docs)

    chain = system_prompt | llm | StrOutputParser()
    complete_sentence = chain.invoke({"question": prompt, "context": formatted_doc})

    output_0 = output_format_docs(retrieved_docs)
    output_1 = complete_sentence
    vectorstore.delete_collection()
    return output_0, output_1


demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.Textbox(
            label='PROMPT',
            info='在这里向Bot提问吧!',
            lines=5,
            placeholder="""例子:
- 故事中有哪些角色?
- 谁是小明?(能够识别关键词是否出现在知识库中)
- 大灰狼为什么要吹倒小猪的房子?
""",
        ),
        gr.File(file_types=['.pdf', '.txt', '.md'], label='支持PDF、TXT、MD格式', value='./story.txt'),
        gr.Textbox(label='URL', info='试试粘贴你感兴趣的网页链接,然后就网页中的内容提问吧!'),
    ],
    outputs=[
        gr.Textbox(label='知识片段', info='这是系统检索到的知识片段,你觉得准不准确呢?'),
        gr.Textbox(label='BOT OUTPUT (gpt-4o-mini)', info='这是Bot根据知识片段的回答,你觉得准不准确呢?'),
    ],
    title="用RAG知识库来提升你的大语言模型BOT准确率",
    description="""\n 虽然大语言模型发展迅速,已经能与我们进行自然语言交流,但总是无法提供准确的专业信息。如果问它一些医学、法律、财务等知识细节,它很有可能给出不准确的答案。\n RAG(Retrieval-Augmented Generation)是一种目前常用的解决方案。我们先将专业知识存入数据库,之后向Bot提问专业知识时,Bot会先检索知识库,再组织措辞给出专业的回答。\n 如此一来,企业不仅能结合自己的数据和知识来开发Bot,让Bot与业务更加适配,还能将知识库保存在自己的服务器中,避免了数据泄露的安全隐患。\n 《三只小猪》这个故事已经存入了知识库,快来试试向Bot提问吧!""",
)

demo.launch(debug=True)