karshreya98
committed on
Commit
•
2e4daca
1
Parent(s):
e2fe55a
made corrections for preprocessing
Browse files
- app.py +3 -2
- utils/haystack.py +1 -1
app.py
CHANGED
@@ -8,7 +8,7 @@ from json import JSONDecodeError
|
|
8 |
from markdown import markdown
|
9 |
from utils.config import parser
|
10 |
from utils.haystack import start_document_store, start_haystack_extractive, start_haystack_rag, query, start_preprocessor_node, start_retriever, start_reader
|
11 |
-
from utils.ui import reset_results, set_initial_state
|
12 |
|
13 |
# Sliders
|
14 |
DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
|
@@ -34,7 +34,7 @@ def upload_files():
|
|
34 |
|
35 |
def process_file(data_file, preprocesor, document_store):
|
36 |
# read file and add content
|
37 |
-
file_contents = data_file.read()
|
38 |
docs = [{
|
39 |
'content': str(file_contents),
|
40 |
'meta': {'name': str(data_file.name)}
|
@@ -47,6 +47,7 @@ def process_file(data_file, preprocesor, document_store):
|
|
47 |
print(f"{data_file.name} already processed")
|
48 |
else:
|
49 |
print(f'preprocessing uploaded doc {data_file.name}.......')
|
|
|
50 |
preprocessed_docs = preprocesor.process(docs)
|
51 |
print('writing to document store.......')
|
52 |
document_store.write_documents(preprocessed_docs)
|
|
|
8 |
from markdown import markdown
|
9 |
from utils.config import parser
|
10 |
from utils.haystack import start_document_store, start_haystack_extractive, start_haystack_rag, query, start_preprocessor_node, start_retriever, start_reader
|
11 |
+
from utils.ui import reset_results, set_initial_state
|
12 |
|
13 |
# Sliders
|
14 |
DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
|
|
|
34 |
|
35 |
def process_file(data_file, preprocesor, document_store):
|
36 |
# read file and add content
|
37 |
+
file_contents = data_file.read().decode("utf-8")
|
38 |
docs = [{
|
39 |
'content': str(file_contents),
|
40 |
'meta': {'name': str(data_file.name)}
|
|
|
47 |
print(f"{data_file.name} already processed")
|
48 |
else:
|
49 |
print(f'preprocessing uploaded doc {data_file.name}.......')
|
50 |
+
#print(data_file.read().decode("utf-8"))
|
51 |
preprocessed_docs = preprocesor.process(docs)
|
52 |
print('writing to document store.......')
|
53 |
document_store.write_documents(preprocessed_docs)
|
utils/haystack.py
CHANGED
@@ -13,7 +13,7 @@ from milvus_haystack import MilvusDocumentStore
|
|
13 |
def start_preprocessor_node():
|
14 |
print('initializing preprocessor node')
|
15 |
processor = PreProcessor(
|
16 |
-
clean_empty_lines=True,
|
17 |
clean_whitespace=True,
|
18 |
clean_header_footer=True,
|
19 |
#remove_substrings=None,
|
|
|
13 |
def start_preprocessor_node():
|
14 |
print('initializing preprocessor node')
|
15 |
processor = PreProcessor(
|
16 |
+
clean_empty_lines= True,
|
17 |
clean_whitespace=True,
|
18 |
clean_header_footer=True,
|
19 |
#remove_substrings=None,
|