Upload st_data_parser.py
Browse files- st_data_parser.py +93 -0
st_data_parser.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
1. Parses several file types, including pdf, docx, xlsx, csv and json.
|
3 |
+
2. For csv, json, xls, xlsx, docx and pdf files, the file content is read directly.
|
4 |
+
|
5 |
+
"""
|
6 |
+
# -*- coding: utf-8 -*-
|
7 |
+
import numpy as np
|
8 |
+
import pandas as pd
|
9 |
+
import re
|
10 |
+
from re import sub
|
11 |
+
import smtplib
|
12 |
+
import matplotlib.pyplot as plt
|
13 |
+
from itertools import product
|
14 |
+
from tqdm import tqdm_notebook, tqdm, trange
|
15 |
+
import time
|
16 |
+
import pretty_errors
|
17 |
+
import seaborn as sns
|
18 |
+
from matplotlib.pyplot import style
|
19 |
+
from rich import print
|
20 |
+
import warnings
|
21 |
+
warnings.filterwarnings('ignore')
|
22 |
+
# style.use('seaborn')
|
23 |
+
import docx ## read docx file. from docx import Document
|
24 |
+
from docx import Document
|
25 |
+
import pandas as pd
|
26 |
+
import PyPDF2
|
27 |
+
|
28 |
+
### Parse a file and return its content; supports pdf, docx, xlsx, csv, json, etc.
|
29 |
+
def parser(file):
    """Return the textual content of an uploaded file.

    The file type is chosen from the extension of ``file.name`` (compared
    case-insensitively):

    * ``.pdf``  -- text of every page, concatenated, via ``PyPDF2.PdfReader``.
    * ``.docx`` -- paragraph texts joined with a single space.
    * ``.xls`` / ``.xlsx`` / ``.xlsm`` -- first sheet rendered with
      ``DataFrame.to_string()``.
    * ``.csv``  -- raw bytes decoded as UTF-8, returned unchanged.
    * ``.json`` -- loaded with ``pandas.read_json`` and rendered with
      ``DataFrame.to_string()``.

    Args:
        file: An uploaded-file object exposing ``name`` and, for csv/json,
            ``getvalue()`` returning bytes (the original comments indicate a
            Streamlit upload). For pdf/docx/excel the object itself is handed
            to the respective reader.

    Returns:
        The extracted text, or ``''`` when the extension is not recognised.
    """
    # Local imports keep this block self-contained without touching the
    # module-level import list.
    import io
    import os

    # Use the real suffix instead of substring tests: "a.pdf.docx" must be
    # treated as docx, and "DATA.CSV" must still be recognised.
    extension = os.path.splitext(file.name)[1].lower()
    file_content = ''

    if extension == '.pdf':
        print('PDF file detected')
        pdf_reader = PyPDF2.PdfReader(file)
        # `or ''` guards against a page yielding no text (e.g. image-only
        # pages) -- presumably possible with some PyPDF2 versions.
        file_content = ''.join(
            (page.extract_text() or '') for page in pdf_reader.pages
        )

    elif extension == '.docx':
        print('Microsoft Word file detected')
        # A Streamlit upload can be passed to Document() like a normal file.
        doc = Document(file)
        file_content = ' '.join(paragraph.text for paragraph in doc.paragraphs)

    elif extension in ('.xls', '.xlsx', '.xlsm'):
        # The original tested `'.xlsx' in file`, i.e. membership on the file
        # object itself, which iterates (and consumes) the stream or raises
        # TypeError; only the name must be inspected.
        print('Excel file detected')
        # A Streamlit upload can be passed to read_excel() like a normal file.
        df = pd.read_excel(file)
        file_content = df.to_string()

    elif extension == '.csv':
        print('CSV file detected')
        # Uploaded content is obtained through getvalue() rather than by
        # opening a path; the CSV text is returned as-is.
        file_content = file.getvalue().decode('utf-8')

    elif extension == '.json':
        # Per the original note, file.read() cannot be used here; getvalue()
        # returns the full uploaded payload.
        json_text = file.getvalue().decode('utf-8')
        # Wrap in StringIO: passing a literal JSON string to read_json is
        # deprecated in recent pandas releases.
        df = pd.read_json(io.StringIO(json_text))
        file_content = df.to_string()

    return file_content
|
76 |
+
|
77 |
+
# res_1 = parser('summary_qwen.csv')
|
78 |
+
# print(res_1) ## pass csv file
|
79 |
+
|
80 |
+
# res_2 = parser('/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/text_mining/训练数据/13011800166202403111112051850.json')
|
81 |
+
# print(res_2) ## pass json file
|
82 |
+
|
83 |
+
# res_3 = parser('/Users/yunshi/Downloads/360Data/Data Center/Consulting Material/第二份资料/2 (H)/北大纵横/北大纵横2/北大纵横—-涟钢团ERP管理咨询项目组织结构设计与主业务流程设计报告/过程文件/涟钢资料/8)公司岗位设置/2002年新定员库2/生服公司.xls')
|
84 |
+
# print(res_3) ## pass xls file
|
85 |
+
|
86 |
+
# res_4 = parser('/Users/yunshi/Downloads/同步空间/LLM/2023ChatGPT/Coding/code_interpreter/rawdata/模拟数据.xlsx')
|
87 |
+
# print(res_4) ## pass xlsx file
|
88 |
+
|
89 |
+
# res_5 = parser('/Users/yunshi/Downloads/360Data/Data Center/Business Force/Project/中移在线/客户资料/201806 广州中心基线评审申报资料/各类附件/1.4 信息系统规划-1.docx')
|
90 |
+
# print(res_5) ## pass docx file
|
91 |
+
|
92 |
+
# res_6 = parser('/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/gradio/中交建/产品演示DEMO/在线国产大模型演示与测试站点.pdf')
|
93 |
+
# print(res_6) ## pass pdf file
|