yangdx
committed on
Commit
·
1eb4cbb
1
Parent(s):
0009218
Sync modifications from main branch
Browse files
lightrag/api/routers/document_routes.py
CHANGED
@@ -117,6 +117,37 @@ class DocumentManager:
|
|
117 |
".docx",
|
118 |
".pptx",
|
119 |
".xlsx",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
),
|
121 |
):
|
122 |
self.input_dir = Path(input_dir)
|
@@ -170,7 +201,41 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|
170 |
|
171 |
# Process based on file type
|
172 |
match ext:
|
173 |
-
case
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
content = file.decode("utf-8")
|
175 |
case ".pdf":
|
176 |
if not pm.is_installed("pypdf2"):
|
|
|
117 |
".docx",
|
118 |
".pptx",
|
119 |
".xlsx",
|
120 |
+
".rtf", # Rich Text Format
|
121 |
+
".odt", # OpenDocument Text
|
122 |
+
".tex", # LaTeX
|
123 |
+
".epub", # Electronic Publication
|
124 |
+
".html", # HyperText Markup Language
|
125 |
+
".htm", # HyperText Markup Language
|
126 |
+
".csv", # Comma-Separated Values
|
127 |
+
".json", # JavaScript Object Notation
|
128 |
+
".xml", # eXtensible Markup Language
|
129 |
+
".yaml", # YAML Ain't Markup Language
|
130 |
+
".yml", # YAML
|
131 |
+
".log", # Log files
|
132 |
+
".conf", # Configuration files
|
133 |
+
".ini", # Initialization files
|
134 |
+
".properties", # Java properties files
|
135 |
+
".sql", # SQL scripts
|
136 |
+
".bat", # Batch files
|
137 |
+
".sh", # Shell scripts
|
138 |
+
".c", # C source code
|
139 |
+
".cpp", # C++ source code
|
140 |
+
".py", # Python source code
|
141 |
+
".java", # Java source code
|
142 |
+
".js", # JavaScript source code
|
143 |
+
".ts", # TypeScript source code
|
144 |
+
".swift", # Swift source code
|
145 |
+
".go", # Go source code
|
146 |
+
".rb", # Ruby source code
|
147 |
+
".php", # PHP source code
|
148 |
+
".css", # Cascading Style Sheets
|
149 |
+
".scss", # Sassy CSS
|
150 |
+
".less", # LESS CSS
|
151 |
),
|
152 |
):
|
153 |
self.input_dir = Path(input_dir)
|
|
|
201 |
|
202 |
# Process based on file type
|
203 |
match ext:
|
204 |
+
case (
|
205 |
+
".txt"
|
206 |
+
| ".md"
|
207 |
+
| ".html"
|
208 |
+
| ".htm"
|
209 |
+
| ".tex"
|
210 |
+
| ".json"
|
211 |
+
| ".xml"
|
212 |
+
| ".yaml"
|
213 |
+
| ".yml"
|
214 |
+
| ".rtf"
|
215 |
+
| ".odt"
|
216 |
+
| ".epub"
|
217 |
+
| ".csv"
|
218 |
+
| ".log"
|
219 |
+
| ".conf"
|
220 |
+
| ".ini"
|
221 |
+
| ".properties"
|
222 |
+
| ".sql"
|
223 |
+
| ".bat"
|
224 |
+
| ".sh"
|
225 |
+
| ".c"
|
226 |
+
| ".cpp"
|
227 |
+
| ".py"
|
228 |
+
| ".java"
|
229 |
+
| ".js"
|
230 |
+
| ".ts"
|
231 |
+
| ".swift"
|
232 |
+
| ".go"
|
233 |
+
| ".rb"
|
234 |
+
| ".php"
|
235 |
+
| ".css"
|
236 |
+
| ".scss"
|
237 |
+
| ".less"
|
238 |
+
):
|
239 |
content = file.decode("utf-8")
|
240 |
case ".pdf":
|
241 |
if not pm.is_installed("pypdf2"):
|