Spaces:

AITextDetect
/

MGTbenchmark

Sleeping

App Files Files Community

Evan73 committed on Oct 31, 2024

Commit

fcb6ffd

1 Parent(s): 9a80e8e

Add app.py

Browse files

Files changed (1) hide show

app.py +83 -28

app.py CHANGED Viewed

@@ -1,32 +1,87 @@
 import streamlit as st
 import json
 import zipfile
-import os
-def process_json(json_file):
-    data = json.load(json_file)
-    # 在这里处理 JSON 数据
-    return f"Processed JSON file: {json_file.name}"
-def process_zip(zip_file):
-    results = []
-    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
-        zip_ref.extractall("temp")
-        for root, _, files in os.walk("temp"):
-            for filename in files:
-                if filename.endswith('.json'):
-                    with open(os.path.join(root, filename)) as f:
-                        data = json.load(f)
-                        # 在这里处理解压后的 JSON 文件
-                        results.append(f"Processed JSON file from ZIP: {filename}")
-    return results
-st.title("JSON and ZIP File Processor")
-uploaded_files = st.file_uploader("Upload JSON or ZIP files", type=["json", "zip"], accept_multiple_files=True)
-if uploaded_files:
-    for file in uploaded_files:
-        if file.name.endswith(".json"):
-            st.write(process_json(file))
-        elif file.name.endswith(".zip"):
-            st.write(process_zip(file))

 import streamlit as st
+import os
 import json
+import re
+import datasets
+import tiktoken
 import zipfile
+from pathlib import Path
+# Define the module-level tiktoken encoder (the "cl100k_base" encoding)
+encoding = tiktoken.get_encoding("cl100k_base")
+# MGTHuman dataset builder
+class MGTHuman(datasets.GeneratorBasedBuilder):
+    """Builder-style wrapper used by the viewer to look up texts by index.
+
+    One config is declared per data source: human-written text plus five
+    generator models. Only the two helper methods below are defined here.
+
+    NOTE(review): this class defines no `_info`, `_split_generators` or
+    `_generate_examples`; confirm that the installed `datasets` version
+    allows instantiating the builder without them.
+    """
+
+    VERSION = datasets.Version("1.0.0")
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(name="human", version=VERSION, description="This part of human data"),
+        datasets.BuilderConfig(name="Moonshot", version=VERSION, description="Data from the Moonshot model"),
+        datasets.BuilderConfig(name="gpt35", version=VERSION, description="Data from the gpt-3.5-turbo model"),
+        datasets.BuilderConfig(name="Llama3", version=VERSION, description="Data from the Llama3 model"),
+        datasets.BuilderConfig(name="Mixtral", version=VERSION, description="Data from the Mixtral model"),
+        datasets.BuilderConfig(name="Qwen", version=VERSION, description="Data from the Qwen model"),
+    ]
+    DEFAULT_CONFIG_NAME = "human"
+
+    def truncate_text(self, text, max_tokens=2048):
+        """Return `text` cut to at most `max_tokens` tokens.
+
+        Text within the limit is returned unchanged. Longer text is cut at
+        the token limit and then trimmed back to the last period ('。' or
+        '.') so that it does not stop mid-sentence; when no period is found
+        the raw cut is returned.
+        """
+        # disallowed_special=() makes any other special-token spelling in
+        # the data (e.g. '<|fim_prefix|>') encode as ordinary text instead
+        # of raising ValueError.
+        tokens = encoding.encode(
+            text,
+            allowed_special={'<|endoftext|>'},
+            disallowed_special=(),
+        )
+        if len(tokens) <= max_tokens:
+            return text
+        truncated_text = encoding.decode(tokens[:max_tokens])
+        # Use whichever terminator occurs last; preferring '。' outright
+        # would discard most of a text that only contains one early on.
+        last_period_idx = max(truncated_text.rfind('。'), truncated_text.rfind('.'))
+        if last_period_idx != -1:
+            truncated_text = truncated_text[:last_period_idx + 1]
+        return truncated_text
+
+    def get_text_by_index(self, filepath, index):
+        """Return the `index`-th non-empty text across the given JSON files.
+
+        Args:
+            filepath: Iterable of paths to JSON files; each file is read as
+                a sequence of rows with a "text" field.
+            index: Zero-based position, counted over rows whose text is not
+                blank, in the order the files are given.
+
+        Returns:
+            The text truncated to 2048 tokens, or an out-of-range message
+            when fewer than `index + 1` non-empty texts exist.
+        """
+        count = 0
+        for file in filepath:
+            # Explicit UTF-8: the platform default encoding may differ, and
+            # the data may hold non-ASCII text (truncation looks for '。').
+            with open(file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            for row in data:
+                # Rows without a usable "text" string are skipped like blank ones.
+                text = row.get("text") if isinstance(row, dict) else None
+                if not isinstance(text, str) or not text.strip():
+                    continue
+                if count == index:
+                    return self.truncate_text(text, max_tokens=2048)
+                count += 1
+        return "Index 超出范围，请输入有效的数字。"
+# Streamlit UI
+st.title("MGTHuman Dataset Viewer")
+# Upload a ZIP archive of the folder that holds the JSON files
+uploaded_folder = st.file_uploader("上传包含 JSON 文件的 ZIP 文件夹", type=["zip"])
+if uploaded_folder:
+    folder_path = Path("temp")
+    folder_path.mkdir(exist_ok=True)
+    # Keep only the final path component of the client-supplied file name.
+    zip_path = folder_path / Path(uploaded_folder.name).name
+    with open(zip_path, "wb") as f:
+        f.write(uploaded_folder.getbuffer())
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(folder_path)
+    # Collect the extracted JSON files. Search recursively because a zipped
+    # folder unpacks into a subdirectory, skip macOS archive metadata
+    # (which is not valid JSON), and sort so that a given index always maps
+    # to the same text.
+    json_files = sorted(
+        p for p in folder_path.rglob("*.json")
+        if "__MACOSX" not in p.parts and not p.name.startswith("._")
+    )
+    # Choose the data config
+    config_name = st.selectbox("选择数据配置", ["human", "Moonshot", "gpt35", "Llama3", "Mixtral", "Qwen"])
+    mgt_human = MGTHuman(name=config_name)
+    # Enter an index to view the corresponding text
+    index_to_view = st.number_input("输入要查看的文本序号", min_value=0, step=1)
+    if st.button("显示文本"):
+        text = mgt_human.get_text_by_index(json_files, index=index_to_view)
+        st.write("对应的文本内容为：", text)
+# Remove the temporary directory that holds the uploaded files
+if st.button("清除文件"):
+    import shutil
+    # ignore_errors: the directory does not exist until something is uploaded.
+    shutil.rmtree("temp", ignore_errors=True)
+    st.write("临时文件已清除。")