Spaces:
Runtime error
Runtime error
File size: 5,489 Bytes
a2f42ca f807e7d a2f42ca f807e7d a2f42ca 26f62c4 f807e7d 26f62c4 f807e7d 26f62c4 c88c1d9 a2f42ca c88c1d9 a2f42ca 26f62c4 a2f42ca 26f62c4 a2f42ca 26f62c4 a2f42ca 26f62c4 a2f42ca 26f62c4 a2f42ca 26f62c4 a2f42ca 26f62c4 a2f42ca 26f62c4 a2f42ca 26f62c4 a2f42ca c88c1d9 26f62c4 a2f42ca 26f62c4 a2f42ca f807e7d 26f62c4 c88c1d9 a2f42ca c88c1d9 a2f42ca c88c1d9 a2f42ca 26f62c4 66b707b 26f62c4 c88c1d9 26f62c4 66b707b a2f42ca 26f62c4 a2f42ca f807e7d a2f42ca 66b707b 26f62c4 c88c1d9 26f62c4 66b707b f807e7d a2f42ca f807e7d a2f42ca f807e7d 26f62c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
import os
import json
import logging
import hashlib
import pandas as pd
from .gpt_processor import (
EmbeddingGenerator,
KeywordsGenerator,
Summarizer,
TopicsGenerator,
Translator,
)
from .pdf_processor import PDFProcessor
# Registry mapping a file extension (as extracted from the uploaded file's
# path) to the processor class that knows how to parse that format.
# WorkFlowController.__init__ looks up the handler for each file here.
processors = {
    "pdf": PDFProcessor,
}
class WorkFlowController:
    """Orchestrate processing of uploaded files into a knowledge base.

    For each input file: parse it with the matching processor from
    ``processors``, translate non-Chinese content to Chinese, attach
    per-page embeddings and a summary, then dump the aggregate to both a
    JSON and a CSV file in the current working directory.

    Attributes set by ``__init__``:
        file_paths: list of input file paths taken from ``file_src``.
        uid: identifier used to name the output files.
        files_info: mapping of file name -> processed file-info dict.
        file_processor: processor class used for the last file (kept for
            backward compatibility with the original implementation).
        json_result_path: path of the written JSON knowledge base.
        csv_result_path: path of the written CSV knowledge base.
    """

    def __init__(self, file_src, uid) -> None:
        # file_src appears to be an iterable of upload objects exposing a
        # ``.name`` path attribute (e.g. gradio file uploads) — TODO confirm.
        self.file_paths = [x.name for x in file_src]
        self.uid = uid
        print(self.file_paths)
        self.files_info = {}
        for file_path in self.file_paths:
            # basename handles OS-specific separators, unlike split("/").
            file_name = os.path.basename(file_path)
            # Lower-case the extension so "REPORT.PDF" is matched too.
            file_format = file_path.split(".")[-1].lower()
            # Raises KeyError for unsupported formats (same as before).
            self.file_processor = processors[file_format]
            file = self.file_processor(file_path).file_info
            file = self.__process_file(file)
            self.files_info[file_name] = file
        self.__dump_to_json()
        self.__dump_to_csv()

    def __get_summary(self, file: dict):
        """Attach a ``summarized_content`` field built from the full text."""
        summarizer = Summarizer()
        file["summarized_content"] = summarizer.summarize(file["file_full_content"])
        return file

    def __get_keywords(self, file: dict):
        """Attach a ``keywords`` field built from the full text.

        NOTE(review): not called from __process_file in this file; kept
        for compatibility.
        """
        keywords_generator = KeywordsGenerator()
        file["keywords"] = keywords_generator.extract_keywords(
            file["file_full_content"]
        )
        return file

    def __get_topics(self, file: dict):
        """Attach a ``topics`` field built from the full text.

        NOTE(review): not called from __process_file in this file; kept
        for compatibility.
        """
        topics_generator = TopicsGenerator()
        file["topics"] = topics_generator.extract_topics(file["file_full_content"])
        return file

    def __get_embedding(self, file):
        """Attach a ``page_embedding`` to every page in ``file_content``."""
        embedding_generator = EmbeddingGenerator()
        # Iterate the actual pages instead of assuming keys are exactly
        # 1..n (the old enumerate/i+1 trick broke on any gap in numbering).
        for page in file["file_content"].values():
            page["page_embedding"] = embedding_generator.get_embedding(
                page["page_content"]
            )
        return file

    def __translate_to_chinese(self, file: dict):
        """Translate every page to Chinese and rebuild ``file_full_content``."""
        translator = Translator()
        translated_pages = []
        for page_num, page in file["file_content"].items():
            print("Translating page: " + str(page_num))
            page["page_content"] = translator.translate_to_chinese(
                page["page_content"]
            )
            translated_pages.append(page["page_content"])
        # Single join instead of quadratic string concatenation.
        file["file_full_content"] = "".join(translated_pages)
        return file

    def __process_file(self, file: dict):
        """Run the per-file pipeline: translate (if needed), embed, summarize."""
        if not file["is_chinese"]:
            print("Translating to chinese...")
            file = self.__translate_to_chinese(file)
        print("Getting embedding...")
        file = self.__get_embedding(file)
        print("Getting summary...")
        file = self.__get_summary(file)
        return file

    def __dump_to_json(self):
        """Write ``files_info`` to <cwd>/<uid>_knowledge_base.json.

        Also records the destination in ``self.json_result_path``.
        """
        # Compute the output path once instead of three separate joins.
        self.json_result_path = os.path.join(
            os.getcwd(), f"{self.uid}_knowledge_base.json"
        )
        print("Dumping to json, the path is: " + self.json_result_path)
        with open(self.json_result_path, "w", encoding="utf-8") as f:
            json.dump(self.files_info, f, indent=4, ensure_ascii=False)

    def __construct_knowledge_base_dataframe(self):
        """Flatten ``files_info`` into one DataFrame row per page."""
        columns = [
            "file_name",
            "page_num",
            "page_content",
            "page_embedding",
        ]
        rows = [
            {
                "file_name": content["file_name"],
                "page_num": page["page_num"],
                "page_content": page["page_content"],
                "page_embedding": page["page_embedding"],
            }
            for content in self.files_info.values()
            for page in content["file_content"].values()
        ]
        return pd.DataFrame(rows, columns=columns)

    def __dump_to_csv(self):
        """Write the flattened knowledge base to <cwd>/<uid>_knowledge_base.csv.

        Also records the destination in ``self.csv_result_path``.
        """
        self.csv_result_path = os.path.join(
            os.getcwd(), f"{self.uid}_knowledge_base.csv"
        )
        df = self.__construct_knowledge_base_dataframe()
        df.to_csv(self.csv_result_path, index=False)
        print("Dumping to csv, the path is: " + self.csv_result_path)

    def __get_file_name(self, file_src):
        """Return an MD5 hex digest over the contents of all input files.

        NOTE(review): despite the name this returns a content hash, not a
        file name; it is currently unused by ``__init__`` but kept for
        compatibility. Files are hashed in basename-sorted order so the
        digest is stable regardless of upload order.
        """
        file_paths = sorted((x.name for x in file_src), key=os.path.basename)
        md5_hash = hashlib.md5()
        for file_path in file_paths:
            with open(file_path, "rb") as f:
                # Stream in 8 KiB chunks to avoid loading whole files.
                while chunk := f.read(8192):
                    md5_hash.update(chunk)
        return md5_hash.hexdigest()
|