Spaces:
Runtime error
Runtime error
ddovidovich
committed on
Commit
•
22b0051
1
Parent(s):
2f7ab4e
update
Browse files- app.py +46 -51
- packages.txt +4 -0
- requirements.txt +11 -8
app.py
CHANGED
@@ -1,57 +1,52 @@
|
|
1 |
-
from dotenv import load_dotenv
|
2 |
import streamlit as st
|
3 |
-
import os
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
|
|
11 |
|
12 |
os.environ["OPENAI_API_KEY"] = os.getenv("SECRET_KEY")
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
if pdf is not None:
|
24 |
-
pdf_reader = PdfReader(pdf)
|
25 |
-
text = ""
|
26 |
-
for page in pdf_reader.pages:
|
27 |
-
text += page.extract_text()
|
28 |
-
|
29 |
-
# split into chunks
|
30 |
-
text_splitter = CharacterTextSplitter(
|
31 |
-
separator="\n",
|
32 |
-
chunk_size=1000,
|
33 |
-
chunk_overlap=200,
|
34 |
-
length_function=len
|
35 |
-
)
|
36 |
-
chunks = text_splitter.split_text(text)
|
37 |
-
|
38 |
-
# create embeddings
|
39 |
-
embeddings = OpenAIEmbeddings()
|
40 |
-
knowledge_base = FAISS.from_texts(chunks, embeddings)
|
41 |
-
|
42 |
-
# show user input
|
43 |
-
user_question = st.text_input("Ask a question about your PDF:")
|
44 |
-
if user_question:
|
45 |
-
docs = knowledge_base.similarity_search(user_question)
|
46 |
-
|
47 |
-
llm = OpenAI()
|
48 |
-
chain = load_qa_chain(llm, chain_type="stuff")
|
49 |
-
with get_openai_callback() as cb:
|
50 |
-
response = chain.run(input_documents=docs, question=user_question)
|
51 |
-
print(cb)
|
52 |
-
|
53 |
-
st.write(response)
|
54 |
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
-
if
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import os
|
3 |
+
import nltk
|
4 |
+
import json
|
5 |
+
|
6 |
+
#os.system('chmod 777 /tmp')
|
7 |
+
#os.system('apt-get update -y')
|
8 |
+
#os.system('apt-get install tesseract-ocr -y')
|
9 |
+
#os.system('pip install -q pytesseract')
|
10 |
+
#os.system('apt-get install poppler-utils')
|
11 |
|
12 |
# Copy the SECRET_KEY environment variable into the variable the OpenAI client reads.
# os.getenv returns None when the variable is missing, and os.environ only
# accepts str values, so fail early with a message that names the missing variable.
_secret_key = os.getenv("SECRET_KEY")
if _secret_key is None:
    raise RuntimeError("SECRET_KEY environment variable is not set; cannot configure OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = _secret_key
|
13 |
|
14 |
+
from langchain.document_loaders import PyPDFLoader
|
15 |
+
from langchain.vectorstores import Chroma
|
16 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
17 |
+
from langchain.document_loaders import UnstructuredPDFLoader
|
18 |
+
from langchain.chat_models import ChatOpenAI
|
19 |
+
from langchain.chains.question_answering import load_qa_chain
|
20 |
+
from PIL import Image
|
21 |
+
from datetime import datetime
|
22 |
+
from tempfile import NamedTemporaryFile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
+
st.subheader("Upload CV in PDF or image format")
|
25 |
+
uploaded_file = st.file_uploader("Upload PDF or Images", type=["pdf","png","jpg","jpeg"])
|
26 |
+
|
27 |
+
nltk.download('punkt')
|
28 |
+
nltk.download('averaged_perceptron_tagger')
|
29 |
|
30 |
+
if uploaded_file:
|
31 |
+
file_name, file_extension = os.path.splitext(uploaded_file.name)
|
32 |
+
if file_extension != '.pdf':
|
33 |
+
uploaded_image = Image.open(uploaded_file)
|
34 |
+
img = uploaded_image.convert('RGB')
|
35 |
+
loader = UnstructuredPDFLoader(img)
|
36 |
+
img.save(file_name+'.pdf')
|
37 |
+
PDFFileName = file_name+'.pdf'
|
38 |
+
else:
|
39 |
+
with NamedTemporaryFile(dir='.', suffix='.pdf') as f:
|
40 |
+
f.write(uploaded_file.getbuffer())
|
41 |
+
PDFFileName = f.name
|
42 |
+
loader = UnstructuredPDFLoader(PDFFileName)
|
43 |
+
pages = loader.load_and_split()
|
44 |
+
embeddings = OpenAIEmbeddings()
|
45 |
+
docsearch = Chroma.from_documents(pages, embeddings).as_retriever()
|
46 |
+
|
47 |
+
current_date = datetime.now()
|
48 |
+
# Prompt for the QA chain: asks for the CV fields as JSON; the current date is appended for the age estimate the prompt requests.
query = "Output information (all in English) from the document in JSON format: full name, age, languages, education, school, places of work, skills. If the date of birth is not indicated, then please calculate the approximate age of the candidate based on the information provided in the document, for calculations, take into account that graduation from the university is usually at 22 years old. Current date = " + current_date.date().strftime('%Y-%m-%d')
|
49 |
+
docs = docsearch.get_relevant_documents(query)
|
50 |
+
chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
|
51 |
+
output = chain.run(input_documents=docs, question=query)
|
52 |
+
st.write(output)
|
packages.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
poppler-utils
|
2 |
+
tesseract-ocr
|
3 |
+
chromium
|
4 |
+
chromium-driver
|
requirements.txt
CHANGED
@@ -1,8 +1,11 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
1 |
+
numpy
|
2 |
+
pandas
|
3 |
+
streamlit
|
4 |
+
langchain
|
5 |
+
openai
|
6 |
+
Pillow
|
7 |
+
unstructured
|
8 |
+
chromadb
|
9 |
+
tiktoken
|
10 |
+
pypdfium2
|
11 |
+
unstructured_inference
|