fplevit committed on
Commit
ae2616f
1 Parent(s): 29cdd97

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -0
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import WebBaseLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_openai import OpenAIEmbeddings,ChatOpenAI
4
+ from langchain_community.vectorstores.faiss import FAISS
5
+
6
+ from langchain.chains import LLMChain
7
+ from dotenv import find_dotenv, load_dotenv
8
+
9
+ from langchain_core.prompts.chat import (
10
+ ChatPromptTemplate,
11
+ SystemMessagePromptTemplate,
12
+ HumanMessagePromptTemplate,
13
+ )
14
+ import gradio as gr
15
+
16
# Locate the nearest .env file and load its variables into the process
# environment before the embeddings client is created
# (presumably this is where the OpenAI API key comes from -- confirm).
_dotenv_file = find_dotenv()
load_dotenv(_dotenv_file)

# Embedding model shared by the vector index built further down.
embeddings = OpenAIEmbeddings()
18
+
19
+ import requests
20
+ from bs4 import BeautifulSoup
21
+ from urllib.parse import urlparse, urljoin
22
+
23
def extract_subdomain_urls(subdomain, timeout=10):
    """Collect the same-host links found on a single page.

    Fetches ``subdomain``, parses its HTML and returns the target of every
    ``<a href>`` whose host matches the host of ``subdomain``.

    Args:
        subdomain: Absolute URL of the page to scan.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of absolute URLs on the same host, with fragments removed,
        de-duplicated, in order of first appearance.

    Raises:
        requests.RequestException: If the page cannot be fetched in time or
            the server answers with an HTTP error status.
    """
    # Without a timeout a stalled server would hang start-up forever.
    response = requests.get(subdomain, timeout=timeout)
    # Fail fast instead of harvesting links from an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    host = urlparse(subdomain).netloc

    # A dict keeps first-seen order while dropping repeated links.
    unique_urls = {}
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue
        # Resolve against the full page URL (not just scheme + host) so that
        # relative links such as "chapter/" stay below the page's own path.
        parsed = urlparse(urljoin(subdomain, href))
        if parsed.netloc != host:
            continue
        # "#section" variants point at the same document; keep it only once.
        unique_urls.setdefault(parsed._replace(fragment="").geturl(), None)

    return list(unique_urls)
40
+
41
# Retrieve every page linked from the handbook landing page.
subdomain = "https://i14y-ch.github.io/handbook/de/"
# Drop repeated links (order preserved) so no page is fetched and embedded twice.
urls = list(dict.fromkeys(extract_subdomain_urls(subdomain)))

# Load the pages with LangChain's WebBaseLoader, with the loader's
# requests_per_second setting capped at 1.
loader = WebBaseLoader(urls)
loader.requests_per_second = 1
# load() returns the raw documents; load_and_split() would already chunk them
# with a default splitter before the explicit splitter below runs again.
pages = loader.load()

# Split into chunks of at most 1000 characters (100 overlap) so that the few
# chunks retrieved per question keep the prompt small.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(pages)

if not docs:
    # An empty index would otherwise fail later with an opaque error.
    raise RuntimeError(f"No handbook content could be loaded from {subdomain}")

# Embed the chunks in a FAISS vector store.
db = FAISS.from_documents(docs, embeddings)
56
+
57
+ # Define the function that answers a user query: it retrieves the most relevant handbook chunks and asks the chat model
58
def get_response_from_query(query):
    """Answer a question about I14Y from the indexed handbook.

    Retrieves the four handbook chunks most similar to ``query`` from the
    module-level FAISS index ``db``, joins them into one extract and asks the
    chat model to answer using that extract.

    Args:
        query: The user's question as free text.

    Returns:
        The model's answer as a string. When ``query`` is empty or only
        whitespace, a short German request to enter a question is returned
        and no API call is made.
    """
    # Nothing to answer: skip the embedding and chat requests entirely.
    if not query or not query.strip():
        return "Bitte geben Sie eine Frage ein."

    # k=4 chunks of at most 1000 characters each keep the prompt within the
    # model's context window (the original note cites 4097 tokens for
    # gpt-3.5-turbo).
    matches = db.similarity_search(query, k=4)
    docs_page_content = " ".join([d.page_content for d in matches])

    chat = ChatOpenAI(temperature=0)

    # System message prompt. The extract is interpolated only once; repeating
    # the {docs} placeholder would paste the whole extract into the prompt
    # twice and double the context tokens.
    template = """
    The I14Y interoperability platform is the central directory of data, electronic interfaces and authority services in Switzerland.
    You are a helpful assistant that answers questions about I14Y based on the platform handbook, of which {docs} is an extract.
    Given a question from a user, you create a final answer based on the information in this extract.
    Whenever you have this information, you must cite the relevant section title of the handbook that you used in your answer.
    If you don't have enough information to answer the question, politely state that you don't know. Do not make up answers.
    If you don't understand the question, ask the user to reformulate it.
    If the question is not about the I14Y interoperability platform, say that you only answer question about I14Y.
    Ensure your answers are detailed, concise, and relevant, providing step-by-step instructions if needed.
    You are very polite and always greet the user with "Grüezi".
    At the end of your answer, ask politely the user if they need any further information.
    Do not include references to platforms other than I14Y in your answers such as for example Geocat.
    Answer in the language in which the question was asked.
    I14Y stands for Interoperability. The user may call I14Y "IOP" but you should not use this name in your answer.
    """
    system_message_prompt = SystemMessagePromptTemplate.from_template(template)

    # Human question prompt
    human_template = "Answer the following question: {question}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages(
        [system_message_prompt, human_message_prompt]
    )

    chain = LLMChain(llm=chat, prompt=chat_prompt)
    # LLMChain.invoke returns a dict; the generated answer is under 'text'.
    response = chain.invoke({'question': query, 'docs': docs_page_content})['text']

    return response
98
+
99
+
100
# Build the Gradio UI: one question box, one answer box and a submit button.
with gr.Blocks(title="I14Y Chatbot", theme=gr.themes.Default(font=gr.themes.GoogleFont("Roboto"), primary_hue="red", secondary_hue="pink")) as demo:

    # Named question_input rather than `input` so the builtin is not shadowed.
    question_input = gr.Textbox(label="Frage mich etwas über die I14Y")
    output = gr.Textbox(label="Antwort")
    search_btn = gr.Button("Frage stellen")
    # Clicking the button sends the question text to get_response_from_query
    # and shows the returned answer in the output box.
    search_btn.click(fn=get_response_from_query, inputs=question_input, outputs=output, api_name="Frage stellen")

# Serve the app locally without creating a public share link.
demo.launch(share=False)