Samarth991 committed
Commit 49dbc00
1 Parent(s): 56bcbd9

adding online PDF loader

Files changed (1):
  app.py  +3 -5
app.py CHANGED
@@ -2,7 +2,7 @@ import os
 import gradio as gr
 
 from langchain.document_loaders import PDFMinerLoader,CSVLoader ,UnstructuredWordDocumentLoader,TextLoader,OnlinePDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings import SentenceTransformerEmbeddings
 from langchain.vectorstores import FAISS
 from langchain import HuggingFaceHub
@@ -27,7 +27,7 @@ def get_openai_chat_model(API_key):
     return llm
 
 def process_documents(documents,data_chunk=1000,chunk_overlap=50):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=data_chunk, chunk_overlap=chunk_overlap)
+    text_splitter = CharacterTextSplitter(chunk_size=data_chunk, chunk_overlap=chunk_overlap,separator='\n')
     texts = text_splitter.split_documents(documents)
     return texts
 
@@ -56,7 +56,6 @@ def document_loader(file_path,api_key,doc_type='pdf',llm='Huggingface'):
     elif doc_type == 'word':
         document = process_word_document(document_file=file_path)
     if document:
-        print("Document :",document)
         texts = process_documents(documents=document)
         vector_db = FAISS.from_documents(documents=texts, embedding= embedding_model)
         global qa
@@ -77,7 +76,6 @@ def process_text_document(document_file):
     document = loader.load()
     return document
 
-
 def process_csv_document(document_file):
     loader = CSVLoader(file_path=document_file.name)
     document = loader.load()
@@ -94,7 +92,7 @@ def process_pdf_document(document_file):
     print("Document File Name :",document_file.name)
     loader = PDFMinerLoader(document_file.name)
     document = loader.load()
-    return document[0]
+    return document
 
 
 
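For context: the new OnlinePDFLoader import suggests the app will also accept a PDF by URL rather than only an uploaded file. Below is a minimal sketch of how that loader could feed the same split-and-index pipeline shown in the diff; the URL and the embedding model name are placeholders, and OnlinePDFLoader needs the unstructured package installed.

from langchain.document_loaders import OnlinePDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# Placeholder URL: OnlinePDFLoader downloads the PDF and returns a list of Documents.
loader = OnlinePDFLoader("https://example.com/sample.pdf")
documents = loader.load()

# Same splitter settings this commit puts in process_documents().
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50, separator='\n')
texts = text_splitter.split_documents(documents)

# Index the chunks the way document_loader() does; the model name is an assumption.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(documents=texts, embedding=embedding_model)

Returning the full list from process_pdf_document (rather than document[0]) keeps its output consistent with the other loaders, since both split_documents and FAISS.from_documents expect a list of Document objects.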