| """ | |
| 1. 完成了多个文件类型的解析,包括pdf, docx, xlsx, csv, json等。 | |
| 1. csv,json, xls, xlxs, docx, pdf文件,直接读取文件内容。 | |
| """ | |
import numpy as np
import pandas as pd
import re
from re import sub
import smtplib
import matplotlib.pyplot as plt
from itertools import product
from tqdm import tqdm_notebook, tqdm, trange
import time
import pretty_errors
import seaborn as sns
from matplotlib.pyplot import style
from rich import print
import warnings
warnings.filterwarnings('ignore')
# style.use('seaborn')
import docx  ## python-docx, used to read .docx files.
from docx import Document
import PyPDF2
### Parse a file and return its content; supports pdf, docx, xlsx, csv, json, etc.
def parser(file):
    file_content = ''
    if '.pdf' in file.name:
        print('PDF file detected')
        # For a local path, the file could be opened explicitly first:
        # pdf_file_obj = open(file, 'rb')
        # pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        pdf_reader = PyPDF2.PdfReader(file)
        num_pages = len(pdf_reader.pages)
        file_content = ''
        for page in range(num_pages):
            page_obj = pdf_reader.pages[page]
            file_content += page_obj.extract_text()
        # pdf_file_obj.close()
    elif '.docx' in file.name:
        print('Microsoft Word file detected')
        doc = Document(file)  ## A Streamlit uploaded file behaves like a regular file object here.
        file_content = ' '.join([paragraph.text for paragraph in doc.paragraphs])
    # elif '.xlsx' in file or '.xls' in file:
    elif '.xlsx' in file.name or '.xls' in file.name:
        print('Excel file detected')
        df = pd.read_excel(file)  ## A Streamlit uploaded file behaves like a regular file object here.
        file_content = df.to_string()
    # elif '.csv' in file:
    elif '.csv' in file.name:
        print('CSV file detected')
        # For a local path: df = pd.read_csv(csv_file); file_content = df.to_string()
        ## In Streamlit, reading an uploaded file's content differs from the usual local-file case.
        csv_file = file.getvalue().decode('utf-8')
        file_content = csv_file
    # elif '.json' in file:
    elif '.json' in file.name:
        # print('JSON file detected')
        json_file = file.getvalue()  ## Get the uploaded JSON content in Streamlit; file.read() cannot be used here.
        json_file = json_file.decode('utf-8')
        df = pd.read_json(json_file)
        file_content = df.to_string()
    # print('file_content:', file_content)
    return file_content
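## Minimal usage sketch, assuming parser() receives a Streamlit UploadedFile (an object
## exposing .name and .getvalue(), as the comments above imply). The function name,
## widget label and variable names below are illustrative only; the function is defined
## but never called, so it does not change the module's behavior.
def _demo_streamlit_upload():
    import streamlit as st
    uploaded_file = st.file_uploader('Upload a file',
                                     type=['pdf', 'docx', 'xlsx', 'xls', 'csv', 'json'])
    if uploaded_file is not None:
        content = parser(uploaded_file)
        st.text_area('Parsed file content', content, height=300)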
# res_1 = parser('summary_qwen.csv')
# print(res_1)  ## pass csv file
# res_2 = parser('/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/text_mining/训练数据/13011800166202403111112051850.json')
# print(res_2)  ## pass json file
# res_3 = parser('/Users/yunshi/Downloads/360Data/Data Center/Consulting Material/第二份资料/2 (H)/北大纵横/北大纵横2/北大纵横—-涟钢团ERP管理咨询项目组织结构设计与主业务流程设计报告/过程文件/涟钢资料/8)公司岗位设置/2002年新定员库2/生服公司.xls')
# print(res_3)  ## pass xls file
# res_4 = parser('/Users/yunshi/Downloads/同步空间/LLM/2023ChatGPT/Coding/code_interpreter/rawdata/模拟数据.xlsx')
# print(res_4)  ## pass xlsx file
# res_5 = parser('/Users/yunshi/Downloads/360Data/Data Center/Business Force/Project/中移在线/客户资料/201806 广州中心基线评审申报资料/各类附件/1.4 信息系统规划-1.docx')
# print(res_5)  ## pass docx file
# res_6 = parser('/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/gradio/中交建/产品演示DEMO/在线国产大模型演示与测试站点.pdf')
# print(res_6)  ## pass pdf file
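## The commented test calls above pass plain path strings, but parser() now expects an
## object with .name and .getvalue() (a Streamlit UploadedFile). A minimal sketch of a
## hypothetical adapter for local paths; the class name LocalUpload is illustrative and
## not part of the original code. io.BytesIO already provides .getvalue() and behaves
## like a binary stream, so every branch of parser() keeps working.
import io
import os

class LocalUpload(io.BytesIO):
    """Wrap a local file path so it can be passed to parser() like an uploaded file."""
    def __init__(self, path):
        with open(path, 'rb') as f:
            super().__init__(f.read())
        self.name = os.path.basename(path)  # parser() inspects the extension via .name

# Example (mirrors res_1 above):
# print(parser(LocalUpload('summary_qwen.csv')))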