ddovidovich committed on
Commit
22b0051
1 Parent(s): 2f7ab4e
Files changed (3) hide show
  1. app.py +46 -51
  2. packages.txt +4 -0
  3. requirements.txt +11 -8
app.py CHANGED
@@ -1,57 +1,52 @@
1
- from dotenv import load_dotenv
2
  import streamlit as st
3
- import os
4
- from PyPDF2 import PdfReader
5
- from langchain.text_splitter import CharacterTextSplitter
6
- from langchain.embeddings.openai import OpenAIEmbeddings
7
- from langchain.vectorstores import FAISS
8
- from langchain.chains.question_answering import load_qa_chain
9
- from langchain.llms import OpenAI
10
- from langchain.callbacks import get_openai_callback
 
11
 
12
  os.environ["OPENAI_API_KEY"] = os.getenv("SECRET_KEY")
13
 
14
- def main():
15
- load_dotenv()
16
- st.set_page_config(page_title="Ask your PDF")
17
- st.header("Ask your PDF 💬")
18
-
19
- # upload file
20
- pdf = st.file_uploader("Upload your PDF", type="pdf")
21
-
22
- # extract the text
23
- if pdf is not None:
24
- pdf_reader = PdfReader(pdf)
25
- text = ""
26
- for page in pdf_reader.pages:
27
- text += page.extract_text()
28
-
29
- # split into chunks
30
- text_splitter = CharacterTextSplitter(
31
- separator="\n",
32
- chunk_size=1000,
33
- chunk_overlap=200,
34
- length_function=len
35
- )
36
- chunks = text_splitter.split_text(text)
37
-
38
- # create embeddings
39
- embeddings = OpenAIEmbeddings()
40
- knowledge_base = FAISS.from_texts(chunks, embeddings)
41
-
42
- # show user input
43
- user_question = st.text_input("Ask a question about your PDF:")
44
- if user_question:
45
- docs = knowledge_base.similarity_search(user_question)
46
-
47
- llm = OpenAI()
48
- chain = load_qa_chain(llm, chain_type="stuff")
49
- with get_openai_callback() as cb:
50
- response = chain.run(input_documents=docs, question=user_question)
51
- print(cb)
52
-
53
- st.write(response)
54
 
 
 
 
 
 
55
 
56
- if __name__ == '__main__':
57
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import os
3
+ import nltk
4
+ import json
5
+
6
+ #os.system('chmod 777 /tmp')
7
+ #os.system('apt-get update -y')
8
+ #os.system('apt-get install tesseract-ocr -y')
9
+ #os.system('pip install -q pytesseract')
10
+ #os.system('apt-get install poppler-utils')
11
 
12
  os.environ["OPENAI_API_KEY"] = os.getenv("SECRET_KEY")
13
 
14
+ from langchain.document_loaders import PyPDFLoader
15
+ from langchain.vectorstores import Chroma
16
+ from langchain.embeddings.openai import OpenAIEmbeddings
17
+ from langchain.document_loaders import UnstructuredPDFLoader
18
+ from langchain.chat_models import ChatOpenAI
19
+ from langchain.chains.question_answering import load_qa_chain
20
+ from PIL import Image
21
+ from datetime import datetime
22
+ from tempfile import NamedTemporaryFile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ st.subheader("Upload CV in PDF or image format")
25
+ uploaded_file = st.file_uploader("Upload PDF or Images", type=["pdf","png","jpg","jpeg"])
26
+
27
+ nltk.download('punkt')
28
+ nltk.download('averaged_perceptron_tagger')
29
 
30
+ if uploaded_file:
31
+ file_name, file_extension = os.path.splitext(uploaded_file.name)
32
+ if file_extension != '.pdf':
33
+ uploaded_image = Image.open(uploaded_file)
34
+ img = uploaded_image.convert('RGB')
35
+ loader = UnstructuredPDFLoader(img)
36
+ img.save(file_name+'.pdf')
37
+ PDFFileName = file_name+'.pdf'
38
+ else:
39
+ with NamedTemporaryFile(dir='.', suffix='.pdf') as f:
40
+ f.write(uploaded_file.getbuffer())
41
+ PDFFileName = f.name
42
+ loader = UnstructuredPDFLoader(PDFFileName)
43
+ pages = loader.load_and_split()
44
+ embeddings = OpenAIEmbeddings()
45
+ docsearch = Chroma.from_documents(pages, embeddings).as_retriever()
46
+
47
+ current_date = datetime.now()
48
+ query = "Output informatio, (all in English), from the document in JSON format: full name, age, languages, education, school, places of work, skills.If the date of birth is not indicated, then please calculate the approximate age of the candidate based on the information provided in the document, for calculations, take into account that graduation from the university is usually at 22 years old. Current date = "+ current_date.date().strftime('%Y-%m-%d')
49
+ docs = docsearch.get_relevant_documents(query)
50
+ chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
51
+ output = chain.run(input_documents=docs, question=query)
52
+ st.write(output)
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ poppler-utils
2
+ tesseract-ocr
3
+ chromium
4
+ chromium-driver
requirements.txt CHANGED
@@ -1,8 +1,11 @@
1
- langchain==0.0.154
2
- PyPDF2==3.0.1
3
- python-dotenv==1.0.0
4
- streamlit==1.18.1
5
- faiss-cpu==1.7.4
6
- altair<5
7
- openai
8
- tiktoken
 
 
 
 
1
+ numpy
2
+ pandas
3
+ streamlit
4
+ langchain
5
+ openai
6
+ Pillow
7
+ unstructured
8
+ chromadb
9
+ tiktoken
10
+ pypdfium2
11
+ unstructured_inference