yungewww commited on
Commit
cdc1e06
1 Parent(s): 78eb289

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +128 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import bs4
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader
6
+ from langchain_chroma import Chroma
7
+ from langchain_openai import OpenAIEmbeddings
8
+ from langchain_core.output_parsers import StrOutputParser
9
+ from langchain_core.runnables import RunnablePassthrough
10
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
11
+ from langchain import hub
12
+ from bs4 import BeautifulSoup
13
+ import requests
14
+ from langchain_core.prompts import ChatPromptTemplate
15
+
16
+
17
# SECURITY FIX: never commit API keys to source control — the original file
# hard-coded an OpenAI key here. The key must now be supplied through the
# OPENAI_API_KEY environment variable (e.g. a Hugging Face Space secret).
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it (or configure it as a Space "
        "secret) before launching the app."
    )

# Chat model used to answer questions from the retrieved context.
llm = ChatOpenAI(model="gpt-4o-mini")

# RAG prompt. NOTE(review): {question}/{context} are injected into both the
# system and the user message — redundant but preserved, since the model
# output depends on the exact prompt.
system_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:"""),
    ("user", "{question}, {context}"),
])
27
+
28
def read_url(url):
    """Fetch a web page and split its paragraph text into document chunks.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    list
        LangChain ``Document`` chunks produced by the recursive splitter.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a non-2xx HTTP status.
    """
    # Timeout added: requests.get() without one can hang the app forever.
    response = requests.get(url, timeout=30)
    # Fail fast on HTTP errors instead of silently chunking an error page.
    response.raise_for_status()

    # Concatenate the text of every <p> tag. join() over a generator avoids
    # the quadratic cost of repeated string concatenation.
    paragraphs = BeautifulSoup(response.text, "html.parser").find_all("p")
    full_content = "".join(p.get_text() for p in paragraphs)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, add_start_index=True
    )
    return text_splitter.create_documents([full_content])
40
+
41
def read_file(file):
    """Load an uploaded file and return it as split document pages.

    Parameters
    ----------
    file : gradio file object or None
        The uploaded file; only ``.pdf``, ``.txt`` and ``.md`` are supported.

    Returns
    -------
    list or None
        Document chunks, or ``None`` when no file was uploaded or the
        extension is unsupported.
    """
    # Guard added: the Gradio File input is None when nothing was uploaded;
    # the original crashed with AttributeError on file.name.
    if file is None:
        return None

    name = file.name
    if name.endswith(".pdf"):
        # PyPDFLoader already splits per page.
        return PyPDFLoader(name).load_and_split()

    if name.endswith((".txt", ".md")):
        pages_no_split = TextLoader(name).load()
        # NOTE(review): chunk_size=100 here vs. 1000 in read_url — presumably
        # intentional for short text files; confirm with the author.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=100, chunk_overlap=20, add_start_index=True
        )
        return text_splitter.split_documents(pages_no_split)

    # Unsupported extension.
    return None
55
+
56
def output_format_docs(docs):
    """Render retrieved documents as numbered knowledge snippets for display.

    Each document becomes a banner line ("THE i KNOWLEDGE SNIPPET") followed
    by its page content; entries are joined with newlines.
    """
    parts = []
    for index, doc in enumerate(docs, start=1):
        banner = f"\n ========== THE {index} KNOWLEDGE SNIPPET ========== "
        parts.append(banner + "\n" + doc.page_content)
    return "\n".join(parts)
62
+
63
def format_docs(docs):
    """Join the page contents of *docs* with blank lines, for prompt context."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
65
+
66
+ # ==================== GRADIO START ====================
67
def greet(prompt, file, url):
    """Answer *prompt* with RAG over an uploaded file or a web page.

    Parameters
    ----------
    prompt : str
        The user's question.
    file : gradio file object or None
        Optional uploaded document (.pdf/.txt/.md).
    url : str
        Optional web-page URL; used instead of the file when non-empty.

    Returns
    -------
    tuple[str, str]
        (formatted knowledge snippets, model answer), or an error message
        plus an empty string when input is missing or unsupported.
    """
    if prompt == "":
        return "You haven't entered the question yet!", ""

    # URL takes precedence over the uploaded file (original behavior).
    if url == "":
        all_splits = read_file(file)
    else:
        all_splits = read_url(url)

    # Guard added: read_file returns None for no/unsupported file — the
    # original crashed inside add_documents in that case.
    if not all_splits:
        return "Please upload a PDF/TXT/MD file or provide a URL first.", ""

    # Fresh in-memory collection per request; deleted below so repeated
    # queries don't accumulate stale documents.
    vectorstore = Chroma(
        collection_name="example_collection",
        embedding_function=OpenAIEmbeddings(),
        # persist_directory="./chroma_langchain_db",  # enable to persist locally
    )
    try:
        vectorstore.add_documents(documents=all_splits)

        retriever = vectorstore.as_retriever()
        retrieved_docs = retriever.invoke(prompt)
        formatted_doc = format_docs(retrieved_docs)

        chain = system_prompt | llm | StrOutputParser()
        complete_sentence = chain.invoke(
            {"question": prompt, "context": formatted_doc}
        )
    finally:
        # Always drop the collection, even when retrieval or generation
        # raises — the original leaked it on any exception.
        vectorstore.delete_collection()

    return output_format_docs(retrieved_docs), complete_sentence
98
+
99
+
100
# ==================== GRADIO UI ====================
demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.Textbox(
            label="PROMPT",
            info="Feel free to ask the Bot your questions here!",
            lines=5,
            placeholder="""Examples:
"What are the key findings of the latest financial report?"
"Can you summarize the main legal requirements for data privacy in this document?"
"What are the recommended treatment options for [specific medical condition] mentioned in the report?"
""",
        ),
        gr.File(
            file_types=[".pdf", ".txt", ".md"],
            label="Support PDF、TXT、MD",
            # value="./story.txt"
        ),
        gr.Textbox(
            label="URL",
            info="Please paste your URL and ask question about the web page!",
        ),
    ],
    outputs=[
        gr.Textbox(
            label="Knowledge Snippets",
            info="These are the knowledge snippets detected by the system. Do you think they are accurate?",
        ),
        # Fixed copy-paste: this box shows the model's answer, not the snippets,
        # so its helper text must not repeat the snippets description.
        gr.Textbox(
            label="BOT OUTPUT (gpt-4o-mini)",
            info="This is the answer generated by gpt-4o-mini from the snippets above. Is it accurate?",
        ),
    ],
    title="Enhancing LLM Accuracy with Retrieval-Augmented Generation (RAG)",
    description="""\n
Large language models (LLM) today often fall short in providing accurate specialized information. Inquiries related to fields such as medicine, law, or finance may result in inaccurate responses.\n
Retrieval-Augmented Generation (RAG) is a widely adopted solution to this challenge. By storing specialized knowledge in a database, RAG enables Bots to search the knowledge base and generate precise, expert-level responses.\n
This methodology not only allows businesses to develop Bots tailored to their specific operations by incorporating proprietary data and knowledge but also ensures enhanced security by hosting the knowledge base on their own servers, thereby reducing the risk of data breaches.\n
Try to upload your own documents or a URL below:""",
)

# Launch only when executed as a script (the Hugging Face Spaces entrypoint
# runs app.py directly), not when this module is imported.
if __name__ == "__main__":
    demo.launch(debug=True)
122
+
123
+
124
+
125
+
126
+
127
+
128
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-community
3
+ langchainhub
4
+ langchain-openai
5
+ langchain-chroma
6
+ bs4
7
+ pypdf
8
+ gradio