Kevin Hu committed on
Commit
d624a4e
·
1 Parent(s): 1391909

rename get_txt to get_text (#2649)

Browse files

### What problem does this PR solve?



### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

deepdoc/parser/utils.py CHANGED
@@ -14,7 +14,7 @@
14
  from rag.nlp import find_codec
15
 
16
 
17
- def get_txt(fnm: str, binary=None) -> str:
18
  txt = ""
19
  if binary:
20
  encoding = find_codec(binary)
 
14
  from rag.nlp import find_codec
15
 
16
 
17
+ def get_text(fnm: str, binary=None) -> str:
18
  txt = ""
19
  if binary:
20
  encoding = find_codec(binary)
rag/app/book.py CHANGED
@@ -10,7 +10,6 @@
10
  # See the License for the specific language governing permissions and
11
  # limitations under the License.
12
  #
13
- import copy
14
  from tika import parser
15
  import re
16
  from io import BytesIO
 
10
  # See the License for the specific language governing permissions and
11
  # limitations under the License.
12
  #
 
13
  from tika import parser
14
  import re
15
  from io import BytesIO
rag/app/laws.py CHANGED
@@ -17,7 +17,7 @@ from io import BytesIO
17
  from docx import Document
18
 
19
  from api.db import ParserType
20
- from deepdoc.parser.utils import get_txt
21
  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
22
  make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
23
  from rag.nlp import rag_tokenizer
@@ -166,7 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
166
 
167
  elif re.search(r"\.txt$", filename, re.IGNORECASE):
168
  callback(0.1, "Start to parse.")
169
- txt = get_txt(filename, binary)
170
  sections = txt.split("\n")
171
  sections = [l for l in sections if l]
172
  callback(0.8, "Finish parsing.")
 
17
  from docx import Document
18
 
19
  from api.db import ParserType
20
+ from deepdoc.parser.utils import get_text
21
  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
22
  make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
23
  from rag.nlp import rag_tokenizer
 
166
 
167
  elif re.search(r"\.txt$", filename, re.IGNORECASE):
168
  callback(0.1, "Start to parse.")
169
+ txt = get_text(filename, binary)
170
  sections = txt.split("\n")
171
  sections = [l for l in sections if l]
172
  callback(0.8, "Finish parsing.")
rag/app/one.py CHANGED
@@ -14,9 +14,9 @@ from tika import parser
14
  from io import BytesIO
15
  import re
16
 
17
- from deepdoc.parser.utils import get_txt
18
  from rag.app import laws
19
- from rag.nlp import rag_tokenizer, tokenize, find_codec
20
  from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
21
 
22
 
@@ -84,7 +84,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
84
 
85
  elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
86
  callback(0.1, "Start to parse.")
87
- txt = get_txt(filename, binary)
88
  sections = txt.split("\n")
89
  sections = [s for s in sections if s]
90
  callback(0.8, "Finish parsing.")
 
14
  from io import BytesIO
15
  import re
16
 
17
+ from deepdoc.parser.utils import get_text
18
  from rag.app import laws
19
+ from rag.nlp import rag_tokenizer, tokenize
20
  from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
21
 
22
 
 
84
 
85
  elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
86
  callback(0.1, "Start to parse.")
87
+ txt = get_text(filename, binary)
88
  sections = txt.split("\n")
89
  sections = [s for s in sections if s]
90
  callback(0.8, "Finish parsing.")
rag/app/qa.py CHANGED
@@ -17,14 +17,16 @@ from timeit import default_timer as timer
17
  from nltk import word_tokenize
18
  from openpyxl import load_workbook
19
 
20
- from deepdoc.parser.utils import get_txt
21
- from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
22
  from rag.nlp import rag_tokenizer, tokenize_table, concat_img
23
  from rag.settings import cron_logger
24
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
25
  from docx import Document
26
  from PIL import Image
27
  from markdown import markdown
 
 
28
  class Excel(ExcelParser):
29
  def __call__(self, fnm, binary=None, callback=None):
30
  if not binary:
@@ -307,7 +309,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
307
  return res
308
  elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
309
  callback(0.1, "Start to parse.")
310
- txt = get_txt(filename, binary)
311
  lines = txt.split("\n")
312
  comma, tab = 0, 0
313
  for l in lines:
@@ -350,7 +352,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
350
  return res
351
  elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
352
  callback(0.1, "Start to parse.")
353
- txt = get_txt(filename, binary)
354
  lines = txt.split("\n")
355
  last_question, last_answer = "", ""
356
  question_stack, level_stack = [], []
 
17
  from nltk import word_tokenize
18
  from openpyxl import load_workbook
19
 
20
+ from deepdoc.parser.utils import get_text
21
+ from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
22
  from rag.nlp import rag_tokenizer, tokenize_table, concat_img
23
  from rag.settings import cron_logger
24
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
25
  from docx import Document
26
  from PIL import Image
27
  from markdown import markdown
28
+
29
+
30
  class Excel(ExcelParser):
31
  def __call__(self, fnm, binary=None, callback=None):
32
  if not binary:
 
309
  return res
310
  elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
311
  callback(0.1, "Start to parse.")
312
+ txt = get_text(filename, binary)
313
  lines = txt.split("\n")
314
  comma, tab = 0, 0
315
  for l in lines:
 
352
  return res
353
  elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
354
  callback(0.1, "Start to parse.")
355
+ txt = get_text(filename, binary)
356
  lines = txt.split("\n")
357
  last_question, last_answer = "", ""
358
  question_stack, level_stack = [], []
rag/app/table.py CHANGED
@@ -21,7 +21,7 @@ from dateutil.parser import parse as datetime_parse
21
 
22
  from api.db.services.knowledgebase_service import KnowledgebaseService
23
  from deepdoc.parser.utils import get_text
24
- from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
25
  from deepdoc.parser import ExcelParser
26
 
27
 
 
21
 
22
  from api.db.services.knowledgebase_service import KnowledgebaseService
23
  from deepdoc.parser.utils import get_text
24
+ from rag.nlp import rag_tokenizer, tokenize
25
  from deepdoc.parser import ExcelParser
26
 
27