fracapuano commited on
Commit
51fe9d2
1 Parent(s): ffc32ba

Add files via upload

Browse files
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: AISandbox
3
- emoji: 🐠
4
  colorFrom: blue
5
- colorTo: pink
6
  sdk: streamlit
7
- sdk_version: 1.26.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
1
  ---
2
+ title: Chat With Files
3
+ emoji: 💬
4
  colorFrom: blue
5
+ colorTo: yellow
6
  sdk: streamlit
7
+ sdk_version: 1.25.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from transformers import pipeline

st.set_page_config(page_title="2023 FS Hackathon")
st.title("Founder's Studio AI Sandbox 🕹️")
expander = st.expander("Click here to close this intro", expanded=True)
expander.write(
    """
    This web app allows you to perform common Natural Language Processing tasks, select a task below to get started.
    These tasks are intended to help you validate your intuition and build a proof of concept for your idea.

    If a task you deem useful is not listed here, feel free to get in touch with Founder's Studio team at francesco.capuano@bain.com.
    Happy hackathon!
    """
)

st.subheader(":point_down: Use the following drop-down menu to select a task!")

# Task labels — keep these in sync with the dispatch below.
OPTION1 = "Chat with a file"  # fixed typo: was "Chat wiht a file"
OPTION2 = "Text summarization"
OPTION_N = "OTHER"

option = st.selectbox(
    "Please select a task 🤖",
    options=[OPTION1, OPTION2, OPTION_N],
)

if option == OPTION_N:
    # Compare against the constant (not a duplicated "OTHER" literal) so
    # renaming the option cannot silently break this branch.
    user_suggestion = st.text_input("Please specify the task you would like to perform", value="")
    if user_suggestion:
        st.write("Thanks for your suggestion, we will get back to you soon!")
    # Stop the script here instead of raising NotImplementedError: the old
    # flow fell through to a `raise` (showing a traceback to the user)
    # whenever "OTHER" was selected before any suggestion was typed.
    st.stop()

if option == OPTION1:
    # Imported lazily so the heavy QA dependencies load only when needed.
    from qa import qa_main
    with st.container():
        qa_main()

elif option == OPTION2:
    from summarization import summarization_main
    with st.container():
        summarization_main()
qa/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
# Re-export the public API of qa.qa (notably qa_main) at package level.
from .qa import *
qa/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (173 Bytes). View file
 
qa/__pycache__/embeddings.cpython-310.pyc ADDED
Binary file (4.04 kB). View file
 
qa/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
qa/__pycache__/qa.cpython-310.pyc ADDED
Binary file (3.37 kB). View file
 
qa/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.46 kB). View file
 
qa/prompts.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain.prompts import PromptTemplate

## One might consider using a shorter template to reduce the number of tokens in the model input
# Few-shot prompt for "stuff"-type QA with sources: one worked example
# (ARPA-H) demonstrates citing the minimal set of sources, after which the
# real {question} and the retrieved {summaries} are substituted in.
template = """Create a final answer to the given questions using the provided document (in no particular order) as references. ALWAYS include a "SOURCES" section in your answer including only the minimal set of sources needed to answer the question. If you are unable to answer the question, simply state that you do not know. Do not attempt to fabricate an answer and leave the SOURCES section empty.
---------
QUESTION: What is the purpose of ARPA-H?
=========
Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
Source: 1-32
Content: While we’re at it, let’s make sure every American can get the health care they need. \n\nWe’ve already made historic investments in health care. \n\nWe’ve made it easier for Americans to get the care they need, when they need it. \n\nWe’ve made it easier for Americans to get the treatments they need, when they need them. \n\nWe’ve made it easier for Americans to get the medications they need, when they need them.
Source: 1-33
Content: The V.A. is pioneering new ways of linking toxic exposures to disease, already helping veterans get the care they deserve. \n\nWe need to extend that same care to all Americans. \n\nThat’s why I’m calling on Congress to pass legislation that would establish a national registry of toxic exposures, and provide health care and financial assistance to those affected.
Source: 1-30
=========
FINAL ANSWER: The purpose of ARPA-H is to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
SOURCES: 1-32
---------
QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

# PromptTemplate consumed by load_qa_with_sources_chain(chain_type="stuff")
# in qa.utils.get_answer; the "summaries" variable carries the retrieved
# document chunks.
STUFF_PROMPT = PromptTemplate(
    template=template, input_variables=["summaries", "question"]
)
qa/qa.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_chat import message
3
+ from openai.error import OpenAIError
4
+ from .utils import (
5
+ parse_docx,
6
+ parse_pdf,
7
+ parse_txt,
8
+ search_docs,
9
+ embed_docs,
10
+ text_to_docs,
11
+ get_answer,
12
+ )
13
+ from uuid import uuid4
14
+
15
def clear_submit():
    """Reset the "submit" flag so a changed input or file requires pressing
    Submit again before a new answer is generated."""
    st.session_state["submit"] = False
17
+
18
def set_openai_api_key(api_key: str):
    """Store the user-provided OpenAI API key in Streamlit session state."""
    st.session_state["OPENAI_API_KEY"] = api_key
20
+
21
def qa_main():
    """Render the "Chat With File" page.

    Collects an OpenAI API key and a document in the sidebar, chunks and
    embeds the document into a FAISS index, then answers questions about
    it through a chat-style interface.
    """
    st.markdown("<h1>This app allows to chat with files!</h1>", unsafe_allow_html=True)
    st.markdown(
        """
        Developed using LangChain and OpenAI Embeddings.</p>
        Before hitting on "Submit", please make sure you have uploaded a file and entered a question.

        You can upload files using the sidebar on the left.
        """,
        unsafe_allow_html=True
    )
    index = None
    doc = None

    with st.sidebar:
        user_secret = st.text_input(
            "OpenAI API Key",
            type="password",
            placeholder="Paste your OpenAI API key here (sk-...)",
            help="You can get your API key from https://platform.openai.com/account/api-keys.",
            value=st.session_state.get("OPENAI_API_KEY", ""),
        )
        if user_secret:
            set_openai_api_key(user_secret)

        uploaded_file = st.file_uploader(
            "Upload a pdf, docx, or txt file",
            type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
            help="Scanned documents are not supported yet!",
            on_change=clear_submit,
            accept_multiple_files=False,
        )
        # Parse the uploaded file into raw text: parse_pdf returns a list
        # of page strings, parse_docx/parse_txt return a single string.
        if uploaded_file is not None:
            if uploaded_file.name.endswith(".pdf"):
                doc = parse_pdf(uploaded_file)
            elif uploaded_file.name.endswith(".docx"):
                doc = parse_docx(uploaded_file)
            elif uploaded_file.name.endswith(".txt"):
                doc = parse_txt(uploaded_file)
            else:
                st.error("File type not yet supported! Supported files: [.pdf, .docx, .txt]")
                doc = None

        # Only index once a document has been parsed: tuple(doc) raised
        # TypeError on every rerun before a file was uploaded (doc=None),
        # and tuple() on a plain string (docx/txt) split it into single
        # characters instead of pages.
        if doc is not None:
            pages = tuple(doc) if isinstance(doc, list) else doc
            text = text_to_docs(text=pages)
            st.write(text[:1])

            try:
                with st.spinner("Indexing document(s)... This may take some time."):
                    index = embed_docs(tuple(text))
                st.session_state["api_key_configured"] = True
            except OpenAIError as e:
                st.error(e._message)

    tab1, tab2 = st.tabs(["Chat With File", "About the Application"])
    with tab1:
        # Chat history lives in session state so it survives Streamlit reruns.
        if 'generated' not in st.session_state:
            st.session_state['generated'] = []

        if 'past' not in st.session_state:
            st.session_state['past'] = []

        def get_text():
            # Only show the question box once an API key has been entered.
            if user_secret:
                st.header("Ask me something about the document:")
                input_text = st.text_area("You:", on_change=clear_submit)
                return input_text

        user_input = get_text()

        button = st.button("Submit")
        if button or st.session_state.get("submit"):
            if not user_input:
                st.error("Please enter a question!")
            elif index is None:
                # Guard: similarity search on a missing index would crash.
                st.error("Please upload a file first!")
            else:
                st.session_state["submit"] = True
                sources = search_docs(index, user_input)
                try:
                    answer = get_answer(sources, user_input)

                    st.session_state.past.append(user_input)
                    st.session_state.generated.append(answer["output_text"])

                except OpenAIError as e:
                    st.error(e._message)

        # Render newest exchange first; uuid4 keys keep widget keys unique.
        if st.session_state['past']:
            for i in range(len(st.session_state['past'])-1, -1, -1):
                message(st.session_state['generated'][i], key=str(uuid4()))
                message(st.session_state['past'][i], is_user=True, key=str(uuid4()))

    with tab2:
        st.write('See sources')
qa/utils.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.vectorstores.faiss import FAISS
3
+ from langchain import OpenAI
4
+ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
5
+ from langchain.embeddings.openai import OpenAIEmbeddings
6
+ from langchain.llms import OpenAI
7
+ from langchain.docstore.document import Document
8
+ from langchain.vectorstores import FAISS, VectorStore
9
+ import docx2txt
10
+ from typing import List, Dict, Any, Union, Text, Tuple
11
+ import re
12
+ from io import BytesIO
13
+ import streamlit as st
14
+ from .prompts import STUFF_PROMPT
15
+ from pypdf import PdfReader
16
+ from openai.error import AuthenticationError
17
+
18
class HashDocument(Document):
    """A Document whose hash is derived from its page content and metadata.

    Lets documents live in hashable containers (e.g. the tuples handed to
    st.cache_data-decorated functions).
    """
    def __hash__(self):
        # Metadata values may be non-strings (text_to_docs stores the
        # integer "page" and "chunk" numbers), so coerce each to str —
        # "".join over ints raised TypeError in the original.
        content = self.page_content + "".join(str(self.metadata[k]) for k in self.metadata)
        return hash(content)
23
+
24
@st.cache_data
def parse_docx(file: BytesIO) -> str:
    """Extract the text of a .docx file, collapsing runs of blank lines."""
    raw_text = docx2txt.process(file)
    # Collapse multiple blank lines into a single paragraph break.
    return re.sub(r"\n\s*\n", "\n\n", raw_text)
30
+
31
+
32
@st.cache_data
def parse_pdf(file: BytesIO) -> List[str]:
    """Extract text from each page of a PDF, cleaning line-break artifacts.

    Returns one cleaned string per page.
    """
    pages = []
    for page in PdfReader(file).pages:
        cleaned = page.extract_text()
        # Merge hyphenated words
        cleaned = re.sub(r"(\w+)-\n(\w+)", r"\1\2", cleaned)
        # Fix newlines in the middle of sentences
        cleaned = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", cleaned.strip())
        # Remove multiple newlines
        cleaned = re.sub(r"\n\s*\n", "\n\n", cleaned)
        pages.append(cleaned)
    return pages
48
+
49
+
50
@st.cache_data
def parse_txt(file: BytesIO) -> str:
    """Decode an uploaded text file as UTF-8, collapsing runs of blank lines."""
    decoded = file.read().decode("utf-8")
    # Collapse multiple blank lines into a single paragraph break.
    return re.sub(r"\n\s*\n", "\n\n", decoded)
56
+
57
+
58
@st.cache_data
def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
    """Convert a string (one page) or a tuple of strings (one per page)
    into a list of chunked Documents with page/chunk/source metadata.

    Each chunk's metadata gets a 1-based "page", a 0-based "chunk", and a
    "source" id of the form "page-chunk" (matched by get_sources against
    the SOURCES section of an answer).

    Raises:
        ValueError: if ``text`` is neither a string nor a tuple.
    """
    if isinstance(text, str):
        # Treat a single string as a one-page document and fall through to
        # the tuple handling below.  (The original `if`/`elif` structure
        # converted the string but then skipped the chunking branch
        # entirely, silently returning None for string input.)
        text = (text,)
    if not isinstance(text, tuple):
        raise ValueError(f"Text must be either a string or a list of strings. Got: {type(text)}")

    # Map each page into a document instance with a 1-based page number.
    page_docs = [HashDocument(page_content=page) for page in text]
    for page_number, page_doc in enumerate(page_docs, start=1):
        page_doc.metadata["page"] = page_number

    # Split pages into chunks.
    doc_chunks = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=20,  # minimal overlap to capture semantic overlap across chunks
    )

    for page_doc in page_docs:
        chunks = text_splitter.split_text(page_doc.page_content)
        for chunk_number, chunk in enumerate(chunks):
            # Distinct name (chunk_doc) — the original rebound `doc` inside
            # the loop, shadowing the page document it was reading from.
            chunk_doc = HashDocument(
                page_content=chunk,
                metadata={"page": page_doc.metadata["page"], "chunk": chunk_number},
            )
            chunk_doc.metadata["source"] = f"{chunk_doc.metadata['page']}-{chunk_doc.metadata['chunk']}"
            doc_chunks.append(chunk_doc)

    return doc_chunks
97
+
98
+
99
@st.cache_data
def embed_docs(_docs: Tuple[Document]) -> VectorStore:
    """Embed a tuple of Documents with OpenAI and return a FAISS index.

    The leading underscore on ``_docs`` tells st.cache_data not to hash
    the argument.

    Raises:
        AuthenticationError: if no API key is stored in session state.
    """
    api_key = st.session_state.get("OPENAI_API_KEY")
    # Guard clause instead of if/else nesting: refuse to embed without a key.
    if not api_key:
        raise AuthenticationError(
            "Enter your OpenAI API key in the sidebar. You can get a key at https://platform.openai.com/account/api-keys."
        )
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    return FAISS.from_documents(list(_docs), embeddings)
113
+
114
@st.cache_data
def search_docs(_index: VectorStore, query: str) -> List[Document]:
    """Return the 5 chunks in the FAISS index most similar to ``query``."""
    return _index.similarity_search(query, k=5)
122
+
123
+
124
@st.cache_data
def get_answer(_docs: List[Document], query: str) -> Dict[str, Any]:
    """Answer ``query`` from the given Documents via a stuff-type
    QA-with-sources chain; the returned dict also carries the text of the
    source chunks used to form the answer."""
    llm = OpenAI(
        temperature=0,
        openai_api_key=st.session_state.get("OPENAI_API_KEY"),
    )
    chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=STUFF_PROMPT)
    return chain({"input_documents": _docs, "question": query})
139
+
140
+
141
@st.cache_data
def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:
    """Return the Documents whose "source" metadata id appears in the
    SOURCES line of the answer's output text."""
    # e.g. "... SOURCES: 1-32, 2-0" -> {"1-32", "2-0"}
    source_keys = answer["output_text"].split("SOURCES: ")[-1].split(", ")
    return [doc for doc in docs if doc.metadata["source"] in source_keys]
154
+
155
def wrap_text_in_html(text: str) -> str:
    """Wrap each newline-separated block of ``text`` in <p> tags.

    A list of page strings is first joined with horizontal rules.
    """
    if isinstance(text, list):
        # Add horizontal rules between pages
        text = "\n<hr/>\n".join(text)
    paragraphs = (f"<p>{line}</p>" for line in text.split("\n"))
    return "".join(paragraphs)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ docx2txt==0.8
2
+ langchain==0.0.274
3
+ openai==0.27.9
4
+ pypdf==3.15.4
5
+ streamlit==1.25.0
6
+ streamlit_chat==0.1.1
7
+ tenacity==8.2.3
8
+ transformers==4.32.0
9
+ altair<5
10
+ torch==2.0.1
11
+ tiktoken==0.4.0
12
+ faiss-cpu==1.7.4
summarization/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
# Re-export the public API of summarization.summarization (notably
# summarization_main) at package level.
from .summarization import *
summarization/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (195 Bytes). View file
 
summarization/__pycache__/summarization.cpython-310.pyc ADDED
Binary file (2.01 kB). View file
 
summarization/summarization.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline
3
+
4
@st.cache_resource
def summarization_model():
    """Build the Pegasus-XSum summarization pipeline (cached across reruns)."""
    checkpoint = "google/pegasus-xsum"
    return pipeline(
        task="summarization",
        model=checkpoint,
        tokenizer=checkpoint,
    )
13
+
14
def _summarize_and_show(text: str) -> None:
    """Load the summarizer, summarize ``text``, and render the result.

    Shared by the free-text and file-upload flows (the original duplicated
    this logic verbatim in both branches).
    """
    with st.spinner(text="Loading summarization model..."):
        summarizer = summarization_model()
    with st.spinner(text="Summarizing text..."):
        summary = summarizer(text, max_length=130, min_length=30)
    st.text(summary[0]["summary_text"])


def summarization_main():
    """Render the text-summarization page: accept text from a textarea or
    an uploaded .txt file and display a Pegasus-XSum summary."""
    st.markdown("<h2 style='text-align: center; color:grey;'>Text Summarization</h2>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: left; color:#F63366; font-size:18px;'><b>What is text summarization about?<b></h3>", unsafe_allow_html=True)
    st.write("Text summarization is producing a shorter version of a given text while preserving its important information.")
    st.markdown('___')
    source = st.radio("How would you like to start? Choose an option below", ["I want to input some text", "I want to upload a file"])
    if source == "I want to input some text":
        sample_text = ""
        text = st.text_area("Input a text in English (10,000 characters max) or use the example below", value=sample_text, max_chars=10000, height=330)

        if st.button("Get summary"):
            _summarize_and_show(text)

    elif source == "I want to upload a file":
        uploaded_file = st.file_uploader("Choose a .txt file to upload", type=["txt"])
        if uploaded_file is not None:
            raw_text = str(uploaded_file.read(), "utf-8")
            text = st.text_area("", value=raw_text, height=330)
            if st.button("Get summary"):
                _summarize_and_show(text)