cutechicken committed
Commit fcd720a • 1 Parent(s): d6a3ccb

Update app.py
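Translates the user-facing strings in app.py from Korean to English (the structural summaries in analyze_file_content(), the Parquet/PDF/CSV/text analysis reports, and the error messages), moves init_msg() up beside analyze_file_content() with its message translated, and deletes the extract_pdf_text_with_ocr() helper along with several redundant comments and blank lines. Note that read_uploaded_file() still calls extract_pdf_text_with_ocr() as its OCR fallback, so that helper must now be provided elsewhere.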

Files changed (1)
  1. app.py +60 -119
app.py CHANGED
@@ -128,6 +128,9 @@ def find_relevant_context(query, top_k=3):
 
     return relevant_contexts
 
+def init_msg():
+    return "Analyzing file..."
+
 def analyze_file_content(content, file_type):
     """Analyze file content and return structural summary"""
     if file_type in ['parquet', 'csv']:
@@ -136,9 +139,9 @@ def analyze_file_content(content, file_type):
             header = lines[0]
             columns = header.count('|') - 1
             rows = len(lines) - 3
-            return f"📊 데이터셋 구조: {columns}개 컬럼, {rows}개 데이터"
+            return f"📊 Dataset Structure: {columns} columns, {rows} rows"
         except:
-            return "❌ 데이터셋 구조 분석 실패"
+            return "❌ Failed to analyze dataset structure"
 
     lines = content.split('\n')
     total_lines = len(lines)
@@ -148,51 +151,11 @@ def analyze_file_content(content, file_type):
         functions = len([line for line in lines if 'def ' in line])
         classes = len([line for line in lines if 'class ' in line])
         imports = len([line for line in lines if 'import ' in line or 'from ' in line])
-        return f"💻 코드 구조: {total_lines}줄 (함수: {functions}, 클래스: {classes}, 임포트: {imports})"
+        return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"
 
     paragraphs = content.count('\n\n') + 1
     words = len(content.split())
-    return f"📝 문서 구조: {total_lines}줄, {paragraphs}단락, 약 {words}단어"
-
-
-def extract_pdf_text_with_ocr(file_path):
-    try:
-        # Poppler 경로 설정
-        if platform.system() == 'Windows':
-            poppler_path = r"C:\Program Files\poppler-0.68.0\bin"
-        else:
-            poppler_path = None  # Linux의 경우 기본 경로 사용
-
-        # PDF를 이미지로 변환
-        images = convert_from_path(
-            file_path,
-            poppler_path=poppler_path,
-            fmt='jpeg',
-            grayscale=False,
-            size=(1700, None)  # 해상도 향상
-        )
-
-        # 전체 텍스트 저장
-        text = ""
-
-        # 각 페이지에 대해 OCR 수행
-        for i, image in enumerate(images):
-            try:
-                # OCR 설정
-                custom_config = r'--oem 3 --psm 6 -l kor+eng'
-                # OCR 수행
-                page_text = pytesseract.image_to_string(
-                    image,
-                    config=custom_config
-                )
-                text += f"\n--- 페이지 {i+1} ---\n{page_text}\n"
-            except Exception as e:
-                print(f"페이지 {i+1} OCR 오류: {str(e)}")
-                continue
-
-        return text
-    except Exception as e:
-        return f"PDF 텍스트 추출 오류: {str(e)}"
+    return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, approximately {words} words"
 
 def read_uploaded_file(file):
     if file is None:
@@ -200,62 +163,56 @@ def read_uploaded_file(file):
     try:
         file_ext = os.path.splitext(file.name)[1].lower()
 
-
-
-        # Parquet 파일 처리
+        # Parquet file processing
         if file_ext == '.parquet':
             try:
                 table = pq.read_table(file.name)
                 df = table.to_pandas()
 
-                content = f"📊 Parquet 파일 분석:\n\n"
-                content += f"1. 기본 정보:\n"
-                content += f"- 전체 행 수: {len(df):,}개\n"
-                content += f"- 전체 열 수: {len(df.columns)}개\n"
-                content += f"- 메모리 사용량: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
+                content = f"📊 Parquet File Analysis:\n\n"
+                content += f"1. Basic Information:\n"
+                content += f"- Total Rows: {len(df):,}\n"
+                content += f"- Total Columns: {len(df.columns)}\n"
+                content += f"- Memory Usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
 
-                content += f"2. 컬럼 정보:\n"
+                content += f"2. Column Information:\n"
                 for col in df.columns:
                     content += f"- {col} ({df[col].dtype})\n"
 
-                content += f"\n3. 데이터 미리보기:\n"
-                # tabulate 사용하여 테이블 형식으로 출력
+                content += f"\n3. Data Preview:\n"
                 content += tabulate(df.head(5), headers='keys', tablefmt='pipe', showindex=False)
 
-                content += f"\n\n4. 결측치 정보:\n"
+                content += f"\n\n4. Missing Values:\n"
                 null_counts = df.isnull().sum()
                 for col, count in null_counts[null_counts > 0].items():
-                    content += f"- {col}: {count:,}개 ({count/len(df)*100:.1f}%)\n"
+                    content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n"
 
-                # 수치형 컬럼에 대한 기본 통계
                 numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
                 if len(numeric_cols) > 0:
-                    content += f"\n5. 수치형 컬럼 통계:\n"
+                    content += f"\n5. Numeric Column Statistics:\n"
                     stats_df = df[numeric_cols].describe()
                     content += tabulate(stats_df, headers='keys', tablefmt='pipe')
 
                 return content, "parquet"
             except Exception as e:
-                return f"Parquet 파일 읽기 오류: {str(e)}", "error"
+                return f"Error reading Parquet file: {str(e)}", "error"
 
-        # PDF 파일 처리
+        # PDF file processing
         if file_ext == '.pdf':
             try:
                 pdf_reader = pypdf.PdfReader(file.name)
                 total_pages = len(pdf_reader.pages)
 
-                content = f"📑 PDF 문서 분석:\n\n"
-                content += f"1. 기본 정보:\n"
-                content += f"- 총 페이지 수: {total_pages}페이지\n"
+                content = f"📑 PDF Document Analysis:\n\n"
+                content += f"1. Basic Information:\n"
+                content += f"- Total Pages: {total_pages}\n"
 
-                # 메타데이터 추출
                 if pdf_reader.metadata:
-                    content += "\n2. 메타데이터:\n"
+                    content += "\n2. Metadata:\n"
                     for key, value in pdf_reader.metadata.items():
                         if value and str(key).startswith('/'):
                             content += f"- {key[1:]}: {value}\n"
 
-                # 먼저 pdfminer로 텍스트 추출 시도
                 try:
                     text = extract_text(
                         file.name,
@@ -269,117 +226,101 @@ def read_uploaded_file(file):
                 except:
                     text = ""
 
-                # pdfminer로 추출 실패시 OCR 시도
                 if not text.strip():
                     text = extract_pdf_text_with_ocr(file.name)
 
-                # 텍스트 분석
                 if text:
                     words = text.split()
                     lines = text.split('\n')
-                    content += f"\n3. 텍스트 분석:\n"
-                    content += f"- 총 단어 수: {len(words):,}개\n"
-                    content += f"- 고유 단어 수: {len(set(words)):,}개\n"
-                    content += f"- 총 라인 수: {len(lines):,}개\n"
+                    content += f"\n3. Text Analysis:\n"
+                    content += f"- Total Words: {len(words):,}\n"
+                    content += f"- Unique Words: {len(set(words)):,}\n"
+                    content += f"- Total Lines: {len(lines):,}\n"
 
-                    # 본문 내용
-                    content += f"\n4. 본문 내용:\n"
-                    preview_length = min(2000, len(text))  # 미리보기 길이 증가
-                    content += f"--- 처음 {preview_length}자 ---\n"
+                    content += f"\n4. Content Preview:\n"
+                    preview_length = min(2000, len(text))
+                    content += f"--- First {preview_length} characters ---\n"
                     content += text[:preview_length]
                     if len(text) > preview_length:
-                        content += f"\n... (총 {len(text):,}자 중 일부 표시)\n"
+                        content += f"\n... (Showing partial content of {len(text):,} characters)\n"
                 else:
-                    content += "\n⚠️ 텍스트 추출 실패"
+                    content += "\n⚠️ Text extraction failed"
 
                 return content, "pdf"
             except Exception as e:
-                return f"PDF 파일 읽기 오류: {str(e)}", "error"
-
-
+                return f"Error reading PDF file: {str(e)}", "error"
 
-        # CSV 파일 처리
+        # CSV file processing
        elif file_ext == '.csv':
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                 try:
                     df = pd.read_csv(file.name, encoding=encoding)
-                    content = f"📊 CSV 파일 분석:\n\n"
-                    content += f"1. 기본 정보:\n"
-                    content += f"- 전체 행 수: {len(df):,}개\n"
-                    content += f"- 전체 열 수: {len(df.columns)}개\n"
-                    content += f"- 메모리 사용량: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
+                    content = f"📊 CSV File Analysis:\n\n"
+                    content += f"1. Basic Information:\n"
+                    content += f"- Total Rows: {len(df):,}\n"
+                    content += f"- Total Columns: {len(df.columns)}\n"
+                    content += f"- Memory Usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n"
 
-                    content += f"2. 컬럼 정보:\n"
+                    content += f"2. Column Information:\n"
                     for col in df.columns:
                         content += f"- {col} ({df[col].dtype})\n"
 
-                    content += f"\n3. 데이터 미리보기:\n"
+                    content += f"\n3. Data Preview:\n"
                     content += df.head(5).to_markdown(index=False)
 
-                    content += f"\n\n4. 결측치 정보:\n"
+                    content += f"\n\n4. Missing Values:\n"
                     null_counts = df.isnull().sum()
                     for col, count in null_counts[null_counts > 0].items():
-                        content += f"- {col}: {count:,}개 ({count/len(df)*100:.1f}%)\n"
+                        content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n"
 
                     return content, "csv"
                 except UnicodeDecodeError:
                     continue
-            raise UnicodeDecodeError(f"지원되는 인코딩으로 파일을 읽을 수 없습니다 ({', '.join(encodings)})")
+            raise UnicodeDecodeError(f"Unable to read file with supported encodings ({', '.join(encodings)})")
 
-        # 텍스트 파일 처리
+        # Text file processing
         else:
             encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
             for encoding in encodings:
                 try:
                     with open(file.name, 'r', encoding=encoding) as f:
                         content = f.read()
-
-                    # 파일 내용 분석
+
                     lines = content.split('\n')
                     total_lines = len(lines)
                     non_empty_lines = len([line for line in lines if line.strip()])
 
-                    # 코드 파일 여부 확인
                     is_code = any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function'])
 
-                    analysis = f"\n📝 파일 분석:\n"
+                    analysis = f"\n📝 File Analysis:\n"
                     if is_code:
-                        # 코드 파일 분석
                         functions = len([line for line in lines if 'def ' in line])
                         classes = len([line for line in lines if 'class ' in line])
                         imports = len([line for line in lines if 'import ' in line or 'from ' in line])
 
-                        analysis += f"- 파일 유형: 코드\n"
-                        analysis += f"- 전체 라인 수: {total_lines:,}줄\n"
-                        analysis += f"- 함수 수: {functions}개\n"
-                        analysis += f"- 클래스 수: {classes}개\n"
-                        analysis += f"- import 문 수: {imports}개\n"
+                        analysis += f"- File Type: Code\n"
+                        analysis += f"- Total Lines: {total_lines:,}\n"
+                        analysis += f"- Functions: {functions}\n"
+                        analysis += f"- Classes: {classes}\n"
+                        analysis += f"- Import Statements: {imports}\n"
                     else:
-                        # 일반 텍스트 파일 분석
                         words = len(content.split())
                         chars = len(content)
 
-                        analysis += f"- 파일 유형: 텍스트\n"
-                        analysis += f"- 전체 라인 수: {total_lines:,}줄\n"
-                        analysis += f"- 실제 내용이 있는 라인 수: {non_empty_lines:,}줄\n"
-                        analysis += f"- 단어 수: {words:,}개\n"
-                        analysis += f"- 문자 수: {chars:,}개\n"
+                        analysis += f"- File Type: Text\n"
+                        analysis += f"- Total Lines: {total_lines:,}\n"
+                        analysis += f"- Non-empty Lines: {non_empty_lines:,}\n"
+                        analysis += f"- Word Count: {words:,}\n"
+                        analysis += f"- Character Count: {chars:,}\n"
 
                     return content + analysis, "text"
                 except UnicodeDecodeError:
                     continue
-            raise UnicodeDecodeError(f"지원되는 인코딩으로 파일을 읽을 수 없습니다 ({', '.join(encodings)})")
+            raise UnicodeDecodeError(f"Unable to read file with supported encodings ({', '.join(encodings)})")
 
     except Exception as e:
-        return f"파일 읽기 오류: {str(e)}", "error"
-
-
-
-
-# 파일 업로드 이벤트 핸들링 수정
-def init_msg():
-    return "파일을 분석하고 있습니다..."
+        return f"Error reading file: {str(e)}", "error"
 
 
 CSS = """
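
For context, the functions touched here fit together in the app's file-upload flow: init_msg() presumably supplies the placeholder message shown while an upload is processed (the deleted comment above it read "file upload event handling fix"). The sketch below is illustrative only; handle_upload() is a hypothetical wrapper, and only init_msg(), read_uploaded_file(), and analyze_file_content() are actual functions from app.py.

# Hypothetical handler -- a sketch for context, not code from this commit
def handle_upload(file):
    if file is None:
        return ""
    print(init_msg())  # interim status: "Analyzing file..."
    # read_uploaded_file() returns (content, type), where type is
    # "parquet", "pdf", "csv", "text", or "error"
    content, file_type = read_uploaded_file(file)
    if file_type == "error":
        return content  # e.g. "Error reading file: ..."
    # analyze_file_content() yields a one-line structural summary, e.g.
    # "📝 Document Structure: 120 lines, 14 paragraphs, approximately 980 words"
    summary = analyze_file_content(content, file_type)
    return f"{summary}\n\n{content}"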
 