KevinHuSh committed on
Commit
b5b25b4
·
1 Parent(s): 1d93b24

refine text decode (#657)

Browse files

### What problem does this PR solve?
Resolves #651.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

deepdoc/parser/excel_parser.py CHANGED
@@ -69,7 +69,7 @@ class RAGFlowExcelParser:
69
 
70
  if fnm.split(".")[-1].lower() in ["csv", "txt"]:
71
  encoding = find_codec(binary)
72
- txt = binary.decode(encoding)
73
  return len(txt.split("\n"))
74
 
75
 
 
69
 
70
  if fnm.split(".")[-1].lower() in ["csv", "txt"]:
71
  encoding = find_codec(binary)
72
+ txt = binary.decode(encoding, errors="ignore")
73
  return len(txt.split("\n"))
74
 
75
 
rag/app/book.py CHANGED
@@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
91
  txt = ""
92
  if binary:
93
  encoding = find_codec(binary)
94
- txt = binary.decode(encoding)
95
  else:
96
  with open(filename, "r") as f:
97
  while True:
 
91
  txt = ""
92
  if binary:
93
  encoding = find_codec(binary)
94
+ txt = binary.decode(encoding, errors="ignore")
95
  else:
96
  with open(filename, "r") as f:
97
  while True:
rag/app/laws.py CHANGED
@@ -113,7 +113,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
113
  txt = ""
114
  if binary:
115
  encoding = find_codec(binary)
116
- txt = binary.decode(encoding)
117
  else:
118
  with open(filename, "r") as f:
119
  while True:
 
113
  txt = ""
114
  if binary:
115
  encoding = find_codec(binary)
116
+ txt = binary.decode(encoding, errors="ignore")
117
  else:
118
  with open(filename, "r") as f:
119
  while True:
rag/app/naive.py CHANGED
@@ -141,7 +141,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
141
  txt = ""
142
  if binary:
143
  encoding = find_codec(binary)
144
- txt = binary.decode(encoding)
145
  else:
146
  with open(filename, "r") as f:
147
  while True:
 
141
  txt = ""
142
  if binary:
143
  encoding = find_codec(binary)
144
+ txt = binary.decode(encoding, errors="ignore")
145
  else:
146
  with open(filename, "r") as f:
147
  while True:
rag/app/one.py CHANGED
@@ -85,7 +85,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
85
  txt = ""
86
  if binary:
87
  encoding = find_codec(binary)
88
- txt = binary.decode(encoding)
89
  else:
90
  with open(filename, "r") as f:
91
  while True:
 
85
  txt = ""
86
  if binary:
87
  encoding = find_codec(binary)
88
+ txt = binary.decode(encoding, errors="ignore")
89
  else:
90
  with open(filename, "r") as f:
91
  while True:
rag/app/qa.py CHANGED
@@ -107,7 +107,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
107
  txt = ""
108
  if binary:
109
  encoding = find_codec(binary)
110
- txt = binary.decode(encoding)
111
  else:
112
  with open(filename, "r") as f:
113
  while True:
 
107
  txt = ""
108
  if binary:
109
  encoding = find_codec(binary)
110
+ txt = binary.decode(encoding, errors="ignore")
111
  else:
112
  with open(filename, "r") as f:
113
  while True:
rag/app/table.py CHANGED
@@ -149,7 +149,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
149
  txt = ""
150
  if binary:
151
  encoding = find_codec(binary)
152
- txt = binary.decode(encoding)
153
  else:
154
  with open(filename, "r") as f:
155
  while True:
 
149
  txt = ""
150
  if binary:
151
  encoding = find_codec(binary)
152
+ txt = binary.decode(encoding, errors="ignore")
153
  else:
154
  with open(filename, "r") as f:
155
  while True: