Spaces:
Sleeping
Sleeping
yogjoshi14
commited on
Commit
•
696ea0a
1
Parent(s):
3b1a154
making document load changes
Browse files
README.md
CHANGED
@@ -38,3 +38,4 @@ Advanced Challenge:
|
|
38 |
1. Embedding: The project uses readily available Hugging Face embeddings, which have a maximum dimension of 768. Alternative embeddings could be used instead, such as HuggingFaceInstructEmbeddings or Ollama embeddings (both open-source), or OpenAI embeddings.
|
39 |
2. LLM: Using an LLM that has more parameters and was trained on more data can also provide better results.
|
40 |
|
|
|
|
38 |
1. Embedding: The project uses readily available Hugging Face embeddings, which have a maximum dimension of 768. Alternative embeddings could be used instead, such as HuggingFaceInstructEmbeddings or Ollama embeddings (both open-source), or OpenAI embeddings.
|
39 |
2. LLM: Using an LLM that has more parameters and was trained on more data can also provide better results.
|
40 |
|
41 |
+
# Hugging Face Space Here : [Space](https://huggingface.co/spaces/yogjoshi14/chat_with_documents)
|
app.py
CHANGED
@@ -11,7 +11,8 @@ from langchain_community.vectorstores import Pinecone
|
|
11 |
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
|
12 |
|
13 |
import streamlit as st
|
14 |
-
|
|
|
15 |
|
16 |
st.set_page_config(page_title="chatbot")
|
17 |
st.title("Chat with Documents")
|
@@ -23,7 +24,7 @@ CHUNK_OVERLAP = 50
|
|
23 |
embedding_dim = 768
|
24 |
|
25 |
# Initialize Pinecone
|
26 |
-
pc = pinecone.Pinecone(api_key=os.environ
|
27 |
index_name = "qp-ai-assessment"
|
28 |
|
29 |
|
@@ -40,16 +41,50 @@ def recreate_index():
|
|
40 |
name=index_name,
|
41 |
metric='cosine',
|
42 |
dimension=embedding_dim,
|
43 |
-
spec=pinecone.PodSpec(os.environ
|
44 |
)
|
45 |
print(f"Created new index: {index_name}")
|
46 |
|
47 |
-
def
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
return text
|
54 |
|
55 |
|
@@ -82,15 +117,10 @@ def query_llm(retriever, query):
|
|
82 |
def input_fields():
|
83 |
#
|
84 |
with st.sidebar:
|
85 |
-
#
|
86 |
-
# if "openai_api_key" in st.secrets:
|
87 |
-
# st.session_state.openai_api_key = st.secrets.openai_api_key
|
88 |
-
# else:
|
89 |
-
# st.session_state.openai_api_key = st.text_input("OpenAI API key", type="password")
|
90 |
|
91 |
-
st.session_state.pinecone_api_key = os.environ
|
92 |
# st.text_input("Pinecone API key", type="password")
|
93 |
-
st.session_state.pinecone_env = os.environ
|
94 |
# st.text_input("Pinecone environment")
|
95 |
st.session_state.pinecone_index = index_name
|
96 |
# st.text_input("Pinecone index name")
|
|
|
11 |
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
|
12 |
|
13 |
import streamlit as st
|
14 |
+
from docx import Document
|
15 |
+
import textract
|
16 |
|
17 |
st.set_page_config(page_title="chatbot")
|
18 |
st.title("Chat with Documents")
|
|
|
24 |
embedding_dim = 768
|
25 |
|
26 |
# Initialize Pinecone
|
27 |
+
# os.environ is a mapping, not a callable: subscript it (raises KeyError if the variable is unset).
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])
|
28 |
index_name = "qp-ai-assessment"
|
29 |
|
30 |
|
|
|
41 |
name=index_name,
|
42 |
metric='cosine',
|
43 |
dimension=embedding_dim,
|
44 |
+
spec=pinecone.PodSpec(os.environ["PINECONE_ENV"])  # os.environ is a mapping: subscript, do not call
|
45 |
)
|
46 |
print(f"Created new index: {index_name}")
|
47 |
|
48 |
+
def get_text_from_pdf(pdf):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf: Whatever ``PdfReader`` accepts as its source argument.

    Returns:
        The per-page ``extract_text()`` results joined with no separator.
    """
    reader = PdfReader(pdf)
    return "".join(page.extract_text() for page in reader.pages)
|
54 |
+
|
55 |
+
def get_text_from_docx(docx):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        docx: Whatever ``Document`` accepts as its source argument.

    Returns:
        Each paragraph's text followed by a newline, in document order.
    """
    document = Document(docx)
    return "".join(f"{para.text}\n" for para in document.paragraphs)
|
61 |
+
|
62 |
+
def get_text_from_text_file(text_file):
    """Return the contents of a UTF-8 text file.

    Args:
        text_file: Either a filesystem path or an already-open file-like
            object exposing ``read()``. ``load_documents`` passes the same
            object it reads ``.name`` from (presumably an uploaded-file
            object -- confirm against the caller), which ``open()`` cannot
            accept, so file-like inputs are read directly.

    Returns:
        The decoded text. Bytes returned by ``read()`` are decoded as UTF-8.
    """
    if hasattr(text_file, "read"):
        content = text_file.read()
        if isinstance(content, (bytes, bytearray)):
            content = bytes(content).decode("utf-8")
        return content

    with open(text_file, "r", encoding="utf-8") as file:
        return file.read()
|
66 |
+
|
67 |
+
def get_text_from_other_file(file_path):
    """Best-effort text extraction through ``textract`` for other file types.

    Args:
        file_path: Passed straight to ``textract.process``.

    Returns:
        The extracted text decoded as UTF-8, or an empty string when
        extraction or decoding raises; the error is printed, not re-raised.
    """
    try:
        raw_bytes = textract.process(file_path, method='pdftotext')
        return raw_bytes.decode('utf-8')
    except Exception as e:
        # Deliberately broad: any failure degrades to "no text" for this file.
        print(f"Error extracting text from {file_path}: {e}")
    return ""
|
74 |
+
|
75 |
+
def load_documents(docs):
    """Extract and concatenate the text of every supplied document.

    The extractor is picked from the lower-cased ``name`` of each document:
    ``.pdf``, ``.docx``, ``.txt``/``.md``, and a generic fallback otherwise.

    Args:
        docs: Iterable of objects exposing a ``name`` attribute; each object
            is handed unchanged to the selected extractor.

    Returns:
        All extracted texts joined in input order with no separator.
    """
    extracted = []
    for doc in docs:
        filename = doc.name.lower()
        if filename.endswith('.pdf'):
            extractor = get_text_from_pdf
        elif filename.endswith('.docx'):
            extractor = get_text_from_docx
        elif filename.endswith(('.txt', '.md')):
            extractor = get_text_from_text_file
        else:
            # Any other extension goes through the generic extractor.
            extractor = get_text_from_other_file
        extracted.append(extractor(doc))

    return "".join(extracted)
|
89 |
|
90 |
|
|
|
117 |
def input_fields():
|
118 |
#
|
119 |
with st.sidebar:
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
+
st.session_state.pinecone_api_key = os.environ["PINECONE_API_KEY"]  # mapping lookup; os.environ is not callable
|
122 |
# st.text_input("Pinecone API key", type="password")
|
123 |
+
st.session_state.pinecone_env = os.environ["PINECONE_ENV"]  # mapping lookup; os.environ is not callable
|
124 |
# st.text_input("Pinecone environment")
|
125 |
st.session_state.pinecone_index = index_name
|
126 |
# st.text_input("Pinecone index name")
|