peter2000 committed on
Commit
bbe4709
1 Parent(s): c2c2862

Update scripts/process.py

Browse files
Files changed (1) hide show
  1. scripts/process.py +10 -19
scripts/process.py CHANGED
@@ -58,39 +58,29 @@ def load_document(
58
  id_hash_keys=id_hash_keys))
59
 
60
  return documents
61
-
62
- def preprocessing(document,
63
- split_by: Literal["sentence", "word"] = 'sentence',
64
- split_length:int = 3):
65
-
66
  """
67
- takes in haystack document object and splits it into synthetically generated paragraphs and applies simple cleaning.
68
  Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
69
  list that contains all text joined together.
70
  """
71
- if split_by == 'sentence':
72
- split_respect_sentence_boundary = False
73
- split_overlap=0
74
- else:
75
- split_respect_sentence_boundary = True
76
- split_overlap= 20
77
-
78
  preprocessor = PreProcessor(
79
  clean_empty_lines=True,
80
  clean_whitespace=True,
81
  clean_header_footer=True,
82
- split_by=split_by,
83
- split_length=split_length,
84
- split_respect_sentence_boundary= split_respect_sentence_boundary,
85
- split_overlap=split_overlap
86
  )
87
  for i in document:
88
  docs_processed = preprocessor.process([i])
89
  for item in docs_processed:
90
  item.content = basic(item.content)
91
 
92
- print("\n your document has been splitted to", len(docs_processed), "paragraphs")
93
- # logger.info("document has been splitted to {}".format(len(docs_processed)))
94
 
95
  # create dataframe of text and list of all text
96
  #df = pd.DataFrame(docs_processed)
@@ -98,5 +88,6 @@ def load_document(
98
  #par_list = df.content.to_list()
99
 
100
  return docs_processed #, df, all_text, par_list
 
101
 
102
 
 
58
  id_hash_keys=id_hash_keys))
59
 
60
  return documents
61
+
62
+ def preprocessing(document):
 
 
 
63
  """
64
+ takes in haystack document object and splits it into paragraphs and applies simple cleaning.
65
  Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
66
  list that contains all text joined together.
67
  """
68
+
 
 
 
 
 
 
69
  preprocessor = PreProcessor(
70
  clean_empty_lines=True,
71
  clean_whitespace=True,
72
  clean_header_footer=True,
73
+ split_by="sentence",
74
+ split_length=3,
75
+ split_respect_sentence_boundary=False,
76
+ split_overlap=1
77
  )
78
  for i in document:
79
  docs_processed = preprocessor.process([i])
80
  for item in docs_processed:
81
  item.content = basic(item.content)
82
 
83
+ st.write("your document has been splitted to", len(docs_processed), "paragraphs")
 
84
 
85
  # create dataframe of text and list of all text
86
  #df = pd.DataFrame(docs_processed)
 
88
  #par_list = df.content.to_list()
89
 
90
  return docs_processed #, df, all_text, par_list
91
+
92
 
93