yogjoshi14 committed on
Commit
696ea0a
1 Parent(s): 3b1a154

making document load changes

Browse files
Files changed (2) hide show
  1. README.md +1 -0
  2. app.py +46 -16
README.md CHANGED
@@ -38,3 +38,4 @@ Advanced Challenge:
38
  1. Embedding: The project uses readily available Hugging Face embeddings, which have a maximum dimension of 768. Alternative embeddings could be used instead, such as HuggingFaceInstructEmbeddings or Ollama embeddings (both open-source), or OpenAI embeddings.
39
  2. LLM: Using an LLM that has more parameters and was trained on more data could also provide better results.
40
 
 
 
38
  1. Embedding: The project uses readily available Hugging Face embeddings, which have a maximum dimension of 768. Alternative embeddings could be used instead, such as HuggingFaceInstructEmbeddings or Ollama embeddings (both open-source), or OpenAI embeddings.
39
  2. LLM: Using an LLM that has more parameters and was trained on more data could also provide better results.
40
 
41
+ # Hugging Face Space Here : [Space](https://huggingface.co/spaces/yogjoshi14/chat_with_documents)
app.py CHANGED
@@ -11,7 +11,8 @@ from langchain_community.vectorstores import Pinecone
11
  from langchain_community.chat_message_histories import StreamlitChatMessageHistory
12
 
13
  import streamlit as st
14
-
 
15
 
16
  st.set_page_config(page_title="chatbot")
17
  st.title("Chat with Documents")
@@ -23,7 +24,7 @@ CHUNK_OVERLAP = 50
23
  embedding_dim = 768
24
 
25
  # Initialize Pinecone
26
- pc = pinecone.Pinecone(api_key=os.environ.getattribute("PINECONE_API_KEY"))
27
  index_name = "qp-ai-assessment"
28
 
29
 
@@ -40,16 +41,50 @@ def recreate_index():
40
  name=index_name,
41
  metric='cosine',
42
  dimension=embedding_dim,
43
- spec=pinecone.PodSpec(os.environ.getattribute("PINECONE_ENV")) # 1536 dim of text-embedding-ada-002
44
  )
45
  print(f"Created new index: {index_name}")
46
 
47
- def load_documents(pdf_docs):
48
- text = ""
49
- for pdf in pdf_docs:
50
- pdf_reader = PdfReader(pdf)
51
- for page in pdf_reader.pages:
52
- text += page.extract_text()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  return text
54
 
55
 
@@ -82,15 +117,10 @@ def query_llm(retriever, query):
82
  def input_fields():
83
  #
84
  with st.sidebar:
85
- #
86
- # if "openai_api_key" in st.secrets:
87
- # st.session_state.openai_api_key = st.secrets.openai_api_key
88
- # else:
89
- # st.session_state.openai_api_key = st.text_input("OpenAI API key", type="password")
90
 
91
- st.session_state.pinecone_api_key = os.environ.getattribute("PINECONE_API_KEY")
92
  # st.text_input("Pinecone API key", type="password")
93
- st.session_state.pinecone_env = os.environ.getattribute("PINECONE_ENV")
94
  # st.text_input("Pinecone environment")
95
  st.session_state.pinecone_index = index_name
96
  # st.text_input("Pinecone index name")
 
11
  from langchain_community.chat_message_histories import StreamlitChatMessageHistory
12
 
13
  import streamlit as st
14
+ from docx import Document
15
+ import textract
16
 
17
  st.set_page_config(page_title="chatbot")
18
  st.title("Chat with Documents")
 
24
  embedding_dim = 768
25
 
26
  # Initialize Pinecone
27
# os.environ is a mapping, not a callable: index it to read the key.
# Indexing raises KeyError at startup if PINECONE_API_KEY is not set.
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])
28
  index_name = "qp-ai-assessment"
29
 
30
 
 
41
  name=index_name,
42
  metric='cosine',
43
  dimension=embedding_dim,
44
+ spec=pinecone.PodSpec(os.environ("PINECONE_ENV")) # 1536 dim of text-embedding-ada-002
45
  )
46
  print(f"Created new index: {index_name}")
47
 
48
def get_text_from_pdf(pdf):
    """Return the text of every page of ``pdf``, concatenated in page order.

    ``pdf`` is handed straight to ``PdfReader``; no separator is inserted
    between pages.
    """
    reader = PdfReader(pdf)
    return "".join(page.extract_text() for page in reader.pages)
54
+
55
def get_text_from_docx(docx):
    """Return the paragraphs of a Word document, each followed by a newline.

    ``docx`` is handed straight to ``Document``; only ``doc.paragraphs`` is
    read.
    """
    document = Document(docx)
    return "".join(para.text + "\n" for para in document.paragraphs)
61
+
62
def get_text_from_text_file(text_file):
    """Return the full text content of a plain-text source.

    Args:
        text_file: Either a filesystem path, or an already-open file-like
            object exposing ``read()``. ``load_documents`` passes the
            document object itself rather than a path, and ``open()``
            cannot accept that.

    Returns:
        The content as ``str``. Bytes are decoded as UTF-8.

    Raises:
        OSError: If a path is given and the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    if hasattr(text_file, "read"):
        data = text_file.read()
        # Binary streams yield bytes; text streams already yield str.
        if isinstance(data, (bytes, bytearray)):
            return bytes(data).decode("utf-8")
        return data

    with open(text_file, "r", encoding="utf-8") as file:
        return file.read()
66
+
67
def get_text_from_other_file(file_path):
    """Best-effort text extraction for file types without a dedicated reader.

    Args:
        file_path: A filesystem path, or a file-like object exposing
            ``read()``. ``textract.process`` is called with a filename, so a
            file-like input is first copied to a temporary file, keeping the
            extension of its ``name`` attribute when it has one.

    Returns:
        The extracted text decoded as UTF-8, or ``""`` if extraction fails
        for any reason. The failure is printed, not raised.
    """
    import os
    import tempfile

    temp_path = None
    try:
        source = file_path
        if hasattr(file_path, "read"):
            suffix = os.path.splitext(getattr(file_path, "name", "") or "")[1]
            data = file_path.read()
            if isinstance(data, str):
                data = data.encode("utf-8")
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as spool:
                # Record the path before writing so cleanup runs even if the write fails.
                temp_path = spool.name
                spool.write(data)
            source = temp_path
        text = textract.process(source, method='pdftotext').decode('utf-8')
        return text
    except Exception as e:
        # Deliberately broad: one unreadable document must not abort the load.
        print(f"Error extracting text from {file_path}: {e}")
        return ""
    finally:
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
74
+
75
def load_documents(docs):
    """Extract and concatenate the text of every document in ``docs``.

    Each item must expose a ``name`` attribute. Its lower-cased suffix
    selects the extractor: ``.pdf``, ``.docx``, ``.txt``/``.md``, and a
    generic fallback for everything else. The item itself is passed to the
    extractor, and results are concatenated in input order with no
    separator.
    """
    pieces = []
    for item in docs:
        filename = item.name.lower()
        if filename.endswith('.pdf'):
            extract = get_text_from_pdf
        elif filename.endswith('.docx'):
            extract = get_text_from_docx
        elif filename.endswith(('.txt', '.md')):
            extract = get_text_from_text_file
        else:
            # Handle other file types, you can extend this as needed
            extract = get_text_from_other_file
        pieces.append(extract(item))

    return "".join(pieces)
89
 
90
 
 
117
  def input_fields():
118
  #
119
  with st.sidebar:
 
 
 
 
 
120
 
121
+ st.session_state.pinecone_api_key = os.environ("PINECONE_API_KEY")
122
  # st.text_input("Pinecone API key", type="password")
123
+ st.session_state.pinecone_env = os.environ("PINECONE_ENV")
124
  # st.text_input("Pinecone environment")
125
  st.session_state.pinecone_index = index_name
126
  # st.text_input("Pinecone index name")