kausthubkannan17 committed
Commit: 8163d1a
Parent: 1a6ee56

feat: OCR support

Files changed (5):
  1. model.py +2 -15
  2. pages/upload_file.py +36 -35
  3. pages/upload_url.py +1 -3
  4. requirements.txt +2 -2
  5. utilis.py +26 -0
model.py CHANGED
@@ -9,24 +9,15 @@ from langchain_core.documents.base import Document
 
 
 class DrakeLM:
-    def __init__(self, model_path: str, db: DeepLake, config: dict, llm_model="gemini-pro"):
+    def __init__(self, model_path: str, db: DeepLake, config: dict):
         """
         Parameters:
             model_path (str): The path to the model in case running Llama
             db (DeepLake): The DeepLake DB object
             config (dict): The configuration for the llama model
-            llm_model (str): The LLM model type
 
         Initialize the DrakeLM model
         """
-        self.llm_model = llm_model
-
-        if llm_model == "llama":
-            self.llama = CTransformers(
-                model=model_path,
-                model_type="llama",
-                config=config
-            )
         self.gemini = ChatGoogleGenerativeAI(model="gemini-pro", convert_system_message_to_human=True)
         self.retriever = db.as_retriever()
         self.chat_history = ChatMessageHistory()
@@ -123,11 +114,7 @@ class DrakeLM:
         """
 
         prompt_template = self.chat_prompt.format(query=query, context=context, rules=rules)
-
-        if self.llm_model == "llama":
-            self.chat_history.add_ai_message(AIMessage(content=self.llama.invoke(prompt_template)))
-        else:
-            self.chat_history.add_ai_message(AIMessage(content=self.gemini.invoke(prompt_template).content))
+        self.chat_history.add_ai_message(AIMessage(content=self.gemini.invoke(prompt_template).content))
 
         return self.chat_history.messages[-1].content
 
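With this change DrakeLM is Gemini-only: the llm_model argument and the CTransformers/Llama branch are removed, and responses always come from ChatGoogleGenerativeAI. A minimal construction sketch, not part of the commit (the DeepLake import path, dataset path, and read_only usage are assumptions that vary by langchain version; model_path and config are kept only because the signature still expects them):

```python
# Sketch only: assumes a populated DeepLake dataset and GOOGLE_API_KEY in the environment.
from langchain.vectorstores import DeepLake  # or langchain_community.vectorstores, depending on version
from model import DrakeLM

db = DeepLake(dataset_path="./deeplake_store", read_only=True)  # hypothetical local dataset
config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}

drake = DrakeLM(model_path="", db=db, config=config)  # model_path/config are now effectively unused
```

If nothing else reads model_path and config, a follow-up could drop them from __init__ as well.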
 
pages/upload_file.py CHANGED
@@ -15,10 +15,9 @@ if st.button("Youtube/Video URL"):
 
 st.subheader('Upload the file')
 uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
+is_scanned = st.toggle("Is the file scanned?")
 allow_make_notes = st.toggle('Make Complete Notes!')
-llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
-st.caption("Note: Llama support to be added soon!")
-drake.llm_model = llm_model
+st.caption("Note: Currently, Drake supports Gemini; Llama support to be added soon!")
 
 
 if uploaded_file:
@@ -27,45 +26,47 @@ if uploaded_file:
     # Chunking the file
     with st.spinner('Please wait, file is chunking ...'):
         try:
-            pdf_stream = io.BytesIO(uploaded_file.read())
-            pdf_reader = PyPDF2.PdfReader(pdf_stream)
+            pdf_stream = io.BytesIO(uploaded_file.getvalue())
 
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text()
+            if is_scanned:
+                text = processing.load_scanned_pdf(uploaded_file.getvalue())
+            else:
+                pdf_reader = PyPDF2.PdfReader(pdf_stream)
+                text = ""
+                for page in pdf_reader.pages:
+                    text += page.extract_text()
 
-            documents, metadata = processing.load_pdf("hello world", text)
+            documents, metadata = processing.load_pdf(text)
             st.session_state["metadata"] = metadata
-            st.success("Successfully chunked the file")
 
         except Exception as e:
             st.error("Error in chunking")
 
-        # Uploading to DB
-        with st.spinner('Please wait, file is uploading ...'):
+    # Uploading to DB
+    with st.spinner('Please wait, documents uploading ...'):
+        try:
+            processing.upload_to_db(documents)
+            st.success("Successfully uploaded the file")
+        except Exception as e:
+            st.error("Error in uploading")
+
+    # Generating Notes
+    if allow_make_notes:
+        with st.spinner('Please wait, notes are being generated ...'):
             try:
-                processing.upload_to_db(documents)
+                config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}
+                notes = drake.create_notes(documents)
+                encoded_text = notes.encode('utf-8')
+                st.success("Notes generated successfully")
+                if st.download_button(
+                    label="Download your notes",
+                    data=encoded_text,
+                    file_name='your_notes.md',
+                    mime='text/markdown',
+                ):
+                    st.switch_page("pages/chat.py")
             except Exception as e:
-                st.error("Error in uploading")
-
-        # Generating Notes
-        if allow_make_notes:
-            with st.spinner('Please wait, notes are being generated ...'):
-                try:
-                    config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}
-                    notes = drake.create_notes(documents)
-                    encoded_text = notes.encode('utf-8')
-                    st.success("Notes generated successfully")
-                    if st.download_button(
-                        label="Download data as Markdown",
-                        data=encoded_text,
-                        file_name='your_notes.md',
-                        mime='text/markdown',
-                    ):
-                        st.switch_page("pages/chat.py")
-                except Exception as e:
-                    print(e)
-                    st.error("Error in generating notes")
+                st.error(f"Error in generating notes: {e}")
 
-        else:
-            st.switch_page("pages/chat.py")
+    else:
+        st.switch_page("pages/chat.py")
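One behavioural detail worth noting: the old code called uploaded_file.read(), which consumes Streamlit's upload buffer, while the new code uses uploaded_file.getvalue(), which can be called repeatedly; that matters here because the bytes are needed twice on the scanned path (for the BytesIO stream and for load_scanned_pdf). A standalone sketch of the same branching with hypothetical wiring (extract_text is not a function in this repo; is_scanned and processing mirror the page code):

```python
import io

import PyPDF2


def extract_text(uploaded_file, is_scanned: bool, processing) -> str:
    """Return raw text from an uploaded PDF, using OCR when it is scanned."""
    raw_bytes = uploaded_file.getvalue()  # safe to call more than once, unlike read()
    if is_scanned:
        return processing.load_scanned_pdf(raw_bytes)
    reader = PyPDF2.PdfReader(io.BytesIO(raw_bytes))
    return "".join(page.extract_text() or "" for page in reader.pages)
```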
pages/upload_url.py CHANGED
@@ -13,9 +13,7 @@ if st.button("PDF/Transcript"):
 
 st.subheader('Enter the Video URL')
 video_url = st.text_input(label="Enter the URL")
-llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
-st.caption("Note: Llama support to be added soon!")
-drake.llm_model = llm_model
+st.caption("Note: Currently, Drake supports Gemini; Llama support to be added soon!")
 
 allow_make_notes = st.toggle('Make Complete Notes!')
 
 
requirements.txt CHANGED
@@ -1,10 +1,10 @@
 PyPDF2
+pdf2image
+pytesseract
 streamlit
 langchain
 deeplake
 assemblyai
 sentence-transformers
 youtube-transcript-api
-modal
-ctransformers
 langchain-google-genai
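Both additions wrap system tools: pdf2image shells out to the Poppler utilities and pytesseract to the Tesseract OCR engine, so installing the Python packages alone is not enough on a fresh machine. A quick sanity check, as a sketch:

```python
# Sketch: confirms the system binaries behind the two new requirements are reachable.
import pytesseract
from pdf2image import convert_from_bytes  # conversion will fail later if Poppler is missing

print("tesseract:", pytesseract.get_tesseract_version())  # raises if tesseract is not on PATH
```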
utilis.py CHANGED
@@ -11,6 +11,9 @@ from langchain.prompts.few_shot import FewShotPromptTemplate
 from langchain.prompts.prompt import PromptTemplate
 from typing import Dict
 import uuid
+from pdf2image import convert_from_bytes
+import pytesseract
+from pytesseract import Output
 
 
 class Processing:
@@ -75,6 +78,29 @@ class Processing:
         print("Created document chunks")
         return self._add_metadata(pdf_chunk, url="NaN", id=str(uuid.uuid4()), source="document", file_type="pdf")
 
+    def load_scanned_pdf(self, file) -> str:
+        """
+        Parameters:
+            file (File): Scanned PDF file to be processed
+
+        Returns:
+            str: Text extracted from the scanned PDF file
+
+        Extract text from scanned PDF file
+        """
+        images = convert_from_bytes(file)
+
+        all_text = ""
+        for image in images:
+            # Perform OCR on the image
+            text = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
+
+            # Extract text from the dictionary
+            page_text = " ".join(text['text'])
+            all_text += page_text + "\n"
+
+        return all_text
+
     def load_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:
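A possible way to exercise the new OCR helper outside Streamlit, as a hedged sketch (the sample filename is made up, and Processing() is assumed to be constructible without arguments, which may not hold in this repo):

```python
from utilis import Processing

processing = Processing()                    # assumed no-argument construction
with open("scanned_sample.pdf", "rb") as f:  # hypothetical scanned PDF
    text = processing.load_scanned_pdf(f.read())

print(text[:300])                            # preview the OCR output
```

Since only the joined word list is kept, pytesseract.image_to_string(image, lang='eng') would return each page's text directly; image_to_data(..., output_type=Output.DICT) is mainly useful when word boxes or confidences are needed.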