Spaces:
Sleeping
Sleeping
File size: 1,148 Bytes
ce24d59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
from haystack.utils import fetch_archive_from_http
from haystack.document_stores import ElasticsearchDocumentStore
import os
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor
doc_dir = "data/JBNU-FOCUS"
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")
print(host)
document_store = ElasticsearchDocumentStore(
host='121.186.58.11',
username="",
password="",
index="document"
)
indexing_pipeline = Pipeline()
text_converter = TextConverter()
preprocessor = PreProcessor(
clean_whitespace=True,
clean_header_footer=True,
clean_empty_lines=True,
split_by="word",
split_length=200,
split_overlap=20,
split_respect_sentence_boundary=True,
)
import os
indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir)]
indexing_pipeline.run_batch(file_paths=files_to_index) |