github-actions[bot] committed on
Commit 43d27f2 · 1 Parent(s): d2a63cc

Auto-sync from demo at Wed Sep 24 09:52:41 UTC 2025

app.py CHANGED
@@ -12,7 +12,7 @@ from graphgen.graphgen import GraphGen
12
  from graphgen.models import OpenAIModel, Tokenizer
13
  from graphgen.models.llm.limitter import RPM, TPM
14
  from graphgen.utils import set_logger
15
- from webui.base import GraphGenParams
16
  from webui.cache_utils import cleanup_workspace, setup_workspace
17
  from webui.count_tokens import count_tokens
18
  from webui.i18n import Translate
@@ -66,13 +66,19 @@ def init_graph_gen(config: dict, env: dict) -> GraphGen:
66
 
67
 
68
  # pylint: disable=too-many-statements
69
- def run_graphgen(params, progress=gr.Progress()):
70
  def sum_tokens(client):
71
  return sum(u["total_tokens"] for u in client.token_usage)
72
 
73
  config = {
74
  "if_trainee_model": params.if_trainee_model,
75
- "input_file": params.input_file,
76
  "output_data_type": params.output_data_type,
77
  "output_data_format": params.output_data_format,
78
  "tokenizer": params.tokenizer,
@@ -91,7 +97,6 @@ def run_graphgen(params, progress=gr.Progress()):
91
  "isolated_node_strategy": params.isolated_node_strategy,
92
  "loss_strategy": params.loss_strategy,
93
  },
94
- "chunk_size": params.chunk_size,
95
  }
96
 
97
  env = {
@@ -284,10 +289,18 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
284
  label="Chunk Size",
285
  minimum=256,
286
  maximum=4096,
287
- value=512,
288
  step=256,
289
  interactive=True,
290
  )
291
  tokenizer = gr.Textbox(
292
  label="Tokenizer", value="cl100k_base", interactive=True
293
  )
@@ -499,7 +512,7 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
499
 
500
  submit_btn.click(
501
  lambda *args: run_graphgen(
502
- GraphGenParams(
503
  if_trainee_model=args[0],
504
  input_file=args[1],
505
  tokenizer=args[2],
@@ -518,12 +531,13 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
518
  trainee_model=args[15],
519
  api_key=args[16],
520
  chunk_size=args[17],
521
- rpm=args[18],
522
- tpm=args[19],
523
- quiz_samples=args[20],
524
- trainee_url=args[21],
525
- trainee_api_key=args[22],
526
- token_counter=args[23],
 
527
  )
528
  ),
529
  inputs=[
@@ -545,6 +559,7 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
545
  trainee_model,
546
  api_key,
547
  chunk_size,
 
548
  rpm,
549
  tpm,
550
  quiz_samples,
 
12
  from graphgen.models import OpenAIModel, Tokenizer
13
  from graphgen.models.llm.limitter import RPM, TPM
14
  from graphgen.utils import set_logger
15
+ from webui.base import WebuiParams
16
  from webui.cache_utils import cleanup_workspace, setup_workspace
17
  from webui.count_tokens import count_tokens
18
  from webui.i18n import Translate
 
66
 
67
 
68
  # pylint: disable=too-many-statements
69
+ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
70
  def sum_tokens(client):
71
  return sum(u["total_tokens"] for u in client.token_usage)
72
 
73
  config = {
74
  "if_trainee_model": params.if_trainee_model,
75
+ "read": {
76
+ "input_file": params.input_file,
77
+ },
78
+ "split": {
79
+ "chunk_size": params.chunk_size,
80
+ "chunk_overlap": params.chunk_overlap,
81
+ },
82
  "output_data_type": params.output_data_type,
83
  "output_data_format": params.output_data_format,
84
  "tokenizer": params.tokenizer,
 
97
  "isolated_node_strategy": params.isolated_node_strategy,
98
  "loss_strategy": params.loss_strategy,
99
  },
 
100
  }
101
 
102
  env = {
 
289
  label="Chunk Size",
290
  minimum=256,
291
  maximum=4096,
292
+ value=1024,
293
  step=256,
294
  interactive=True,
295
  )
296
+ chunk_overlap = gr.Slider(
297
+ label="Chunk Overlap",
298
+ minimum=0,
299
+ maximum=500,
300
+ value=100,
301
+ step=100,
302
+ interactive=True,
303
+ )
304
  tokenizer = gr.Textbox(
305
  label="Tokenizer", value="cl100k_base", interactive=True
306
  )
 
512
 
513
  submit_btn.click(
514
  lambda *args: run_graphgen(
515
+ WebuiParams(
516
  if_trainee_model=args[0],
517
  input_file=args[1],
518
  tokenizer=args[2],
 
531
  trainee_model=args[15],
532
  api_key=args[16],
533
  chunk_size=args[17],
534
+ chunk_overlap=args[18],
535
+ rpm=args[19],
536
+ tpm=args[20],
537
+ quiz_samples=args[21],
538
+ trainee_url=args[22],
539
+ trainee_api_key=args[23],
540
+ token_counter=args[24],
541
  )
542
  ),
543
  inputs=[
 
559
  trainee_model,
560
  api_key,
561
  chunk_size,
562
+ chunk_overlap,
563
  rpm,
564
  tpm,
565
  quiz_samples,
graphgen/bases/base_splitter.py ADDED
@@ -0,0 +1,135 @@
1
+ import copy
2
+ import re
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+ from typing import Callable, Iterable, List, Literal, Optional, Union
6
+
7
+ from graphgen.bases.datatypes import Chunk
8
+ from graphgen.utils import logger
9
+
10
+
11
+ @dataclass
12
+ class BaseSplitter(ABC):
13
+ """
14
+ Abstract base class for splitting text into smaller chunks.
15
+ """
16
+
17
+ chunk_size: int = 1024
18
+ chunk_overlap: int = 100
19
+ length_function: Callable[[str], int] = len
20
+ keep_separator: bool = False
21
+ add_start_index: bool = False
22
+ strip_whitespace: bool = True
23
+
24
+ @abstractmethod
25
+ def split_text(self, text: str) -> List[str]:
26
+ """
27
+ Split the input text into smaller chunks.
28
+
29
+ :param text: The input text to be split.
30
+ :return: A list of text chunks.
31
+ """
32
+
33
+ def create_chunks(
34
+ self, texts: List[str], metadatas: Optional[List[dict]] = None
35
+ ) -> List[Chunk]:
36
+ """Create chunks from a list of texts."""
37
+ _metadatas = metadatas or [{}] * len(texts)
38
+ chunks = []
39
+ for i, text in enumerate(texts):
40
+ index = 0
41
+ previous_chunk_len = 0
42
+ for chunk in self.split_text(text):
43
+ metadata = copy.deepcopy(_metadatas[i])
44
+ if self.add_start_index:
45
+ offset = index + previous_chunk_len - self.chunk_overlap
46
+ index = text.find(chunk, max(0, offset))
47
+ metadata["start_index"] = index
48
+ previous_chunk_len = len(chunk)
49
+ new_chunk = Chunk(content=chunk, metadata=metadata)
50
+ chunks.append(new_chunk)
51
+ return chunks
52
+
53
+ def _join_chunks(self, chunks: List[str], separator: str) -> Optional[str]:
54
+ text = separator.join(chunks)
55
+ if self.strip_whitespace:
56
+ text = text.strip()
57
+ if text == "":
58
+ return None
59
+ return text
60
+
61
+ def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
62
+ # We now want to combine these smaller pieces into medium size chunks to send to the LLM.
63
+ separator_len = self.length_function(separator)
64
+
65
+ chunks = []
66
+ current_chunk: List[str] = []
67
+ total = 0
68
+ for d in splits:
69
+ _len = self.length_function(d)
70
+ if (
71
+ total + _len + (separator_len if len(current_chunk) > 0 else 0)
72
+ > self.chunk_size
73
+ ):
74
+ if total > self.chunk_size:
75
+ logger.warning(
76
+ "Created a chunk of size %s, which is longer than the specified %s",
77
+ total,
78
+ self.chunk_size,
79
+ )
80
+ if len(current_chunk) > 0:
81
+ chunk = self._join_chunks(current_chunk, separator)
82
+ if chunk is not None:
83
+ chunks.append(chunk)
84
+ # Keep on popping if:
85
+ # - we have a larger chunk than in the chunk overlap
86
+ # - or if we still have any chunks and the length is long
87
+ while total > self.chunk_overlap or (
88
+ total + _len + (separator_len if len(current_chunk) > 0 else 0)
89
+ > self.chunk_size
90
+ and total > 0
91
+ ):
92
+ total -= self.length_function(current_chunk[0]) + (
93
+ separator_len if len(current_chunk) > 1 else 0
94
+ )
95
+ current_chunk = current_chunk[1:]
96
+ current_chunk.append(d)
97
+ total += _len + (separator_len if len(current_chunk) > 1 else 0)
98
+ chunk = self._join_chunks(current_chunk, separator)
99
+ if chunk is not None:
100
+ chunks.append(chunk)
101
+ return chunks
102
+
103
+ @staticmethod
104
+ def _split_text_with_regex(
105
+ text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
106
+ ) -> List[str]:
107
+ # Now that we have the separator, split the text
108
+ if separator:
109
+ if keep_separator:
110
+ # The parentheses in the pattern keep the delimiters in the result.
111
+ _splits = re.split(f"({separator})", text)
112
+ splits = (
113
+ (
114
+ [
115
+ _splits[i] + _splits[i + 1]
116
+ for i in range(0, len(_splits) - 1, 2)
117
+ ]
118
+ )
119
+ if keep_separator == "end"
120
+ else (
121
+ [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
122
+ )
123
+ )
124
+ if len(_splits) % 2 == 0:
125
+ splits += _splits[-1:]
126
+ splits = (
127
+ (splits + [_splits[-1]])
128
+ if keep_separator == "end"
129
+ else ([_splits[0]] + splits)
130
+ )
131
+ else:
132
+ splits = re.split(separator, text)
133
+ else:
134
+ splits = list(text)
135
+ return [s for s in splits if s != ""]
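Not part of this commit, but for orientation: a minimal usage sketch that drives the new BaseSplitter through one of the concrete subclasses added below (CharacterSplitter); the sample text and sizes are illustrative.

```python
# Illustrative sketch only: exercising BaseSplitter via the CharacterSplitter
# added in graphgen/models/splitter/character_splitter.py by this commit.
from graphgen.models.splitter.character_splitter import CharacterSplitter

text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
splitter = CharacterSplitter(separator="\n\n", chunk_size=30, chunk_overlap=5)
for piece in splitter.split_text(text):
    # Pieces are merged back up toward chunk_size by _merge_splits.
    print(repr(piece))
```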
graphgen/bases/datatypes.py ADDED
@@ -0,0 +1,18 @@
1
+ from dataclasses import dataclass, field
2
+
3
+
4
+ @dataclass
5
+ class Chunk:
6
+ id: str
7
+ content: str
8
+ metadata: dict = field(default_factory=dict)
9
+
10
+
11
+ @dataclass
12
+ class QAPair:
13
+ """
14
+ A pair of question and answer.
15
+ """
16
+
17
+ question: str
18
+ answer: str
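These two dataclasses replace the old graphgen.models.text types (deleted later in this commit); a trivial construction sketch with illustrative values:

```python
from graphgen.bases.datatypes import Chunk, QAPair

chunk = Chunk(id="chunk-abc123", content="GraphGen synthesizes QA data from knowledge graphs.")
pair = QAPair(question="What does GraphGen synthesize?", answer="QA data from knowledge graphs.")
print(chunk.metadata, pair.question)  # metadata defaults to an empty dict
```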
graphgen/{models/text → configs}/__init__.py RENAMED
File without changes
graphgen/configs/aggregated_config.yaml CHANGED
@@ -1,4 +1,8 @@
1
- input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
 
 
 
 
2
  output_data_type: aggregated # atomic, aggregated, multi_hop, cot
3
  output_data_format: ChatML # Alpaca, Sharegpt, ChatML
4
  tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 
1
+ read:
2
+ input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ split:
4
+ chunk_size: 1024 # chunk size for text splitting
5
+ chunk_overlap: 100 # chunk overlap for text splitting
6
  output_data_type: aggregated # atomic, aggregated, multi_hop, cot
7
  output_data_format: ChatML # Alpaca, Sharegpt, ChatML
8
  tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/configs/atomic_config.yaml CHANGED
@@ -1,4 +1,8 @@
1
- input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
 
 
 
 
2
  output_data_type: atomic # atomic, aggregated, multi_hop, cot
3
  output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
4
  tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 
1
+ read:
2
+ input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
3
+ split:
4
+ chunk_size: 1024 # chunk size for text splitting
5
+ chunk_overlap: 100 # chunk overlap for text splitting
6
  output_data_type: atomic # atomic, aggregated, multi_hop, cot
7
  output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
8
  tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/configs/cot_config.yaml CHANGED
@@ -1,4 +1,8 @@
1
- input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples
 
 
 
 
2
  output_data_type: cot # atomic, aggregated, multi_hop, cot
3
  output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
4
  tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 
1
+ read:
2
+ input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ split:
4
+ chunk_size: 1024 # chunk size for text splitting
5
+ chunk_overlap: 100 # chunk overlap for text splitting
6
  output_data_type: cot # atomic, aggregated, multi_hop, cot
7
  output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
8
  tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/configs/multi_hop_config.yaml CHANGED
@@ -1,4 +1,8 @@
1
- input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
 
 
 
 
2
  output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
3
  output_data_format: ChatML # Alpaca, Sharegpt, ChatML
4
  tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
 
1
+ read:
2
+ input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ split:
4
+ chunk_size: 1024 # chunk size for text splitting
5
+ chunk_overlap: 100 # chunk overlap for text splitting
6
  output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
7
  output_data_format: ChatML # Alpaca, Sharegpt, ChatML
8
  tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
graphgen/evaluate.py CHANGED
@@ -1,11 +1,15 @@
1
  """Evaluate the quality of the generated text using various metrics"""
2
 
3
- import os
4
- import json
5
  import argparse
 
 
 
6
  import pandas as pd
7
  from dotenv import load_dotenv
8
- from .models import LengthEvaluator, MTLDEvaluator, RewardEvaluator, TextPair, UniEvaluator
 
 
 
9
  from .utils import logger, set_logger
10
 
11
  sys_path = os.path.abspath(os.path.dirname(__file__))
@@ -13,15 +17,15 @@ set_logger(os.path.join(sys_path, "cache", "logs", "evaluate.log"))
13
 
14
  load_dotenv()
15
 
 
16
  def evaluate_length(corpus, tokenizer_name):
17
- length_evaluator = LengthEvaluator(
18
- tokenizer_name=tokenizer_name
19
- )
20
  logger.info("Length evaluator loaded")
21
  scores = length_evaluator.get_average_score(corpus)
22
  logger.info("Length scores: %s", scores)
23
  return scores
24
 
 
25
  def evaluate_mtld(corpus):
26
  mtld_evaluator = MTLDEvaluator()
27
  logger.info("MTLD evaluator loaded")
@@ -31,30 +35,30 @@ def evaluate_mtld(corpus):
31
  logger.info("MTLD min max scores: %s", min_max_scores)
32
  return scores, min_max_scores
33
 
 
34
  def evaluate_reward(corpus, reward_model_names):
35
  scores = []
36
  for reward_name in reward_model_names:
37
- reward_evaluator = RewardEvaluator(
38
- reward_name=reward_name
39
- )
40
  logger.info("Loaded reward model: %s", reward_name)
41
  average_score = reward_evaluator.get_average_score(corpus)
42
  logger.info("%s scores: %s", reward_name, average_score)
43
  min_max_scores = reward_evaluator.get_min_max_score(corpus)
44
  logger.info("%s min max scores: %s", reward_name, min_max_scores)
45
- scores.append({
46
- 'reward_name': reward_name.split('/')[-1],
47
- 'score': average_score,
48
- 'min_max_scores': min_max_scores
49
- })
 
 
50
  del reward_evaluator
51
  clean_gpu_cache()
52
  return scores
53
 
 
54
  def evaluate_uni(corpus, uni_model_name):
55
- uni_evaluator = UniEvaluator(
56
- model_name=uni_model_name
57
- )
58
  logger.info("Uni evaluator loaded with model %s", uni_model_name)
59
  uni_scores = uni_evaluator.get_average_score(corpus)
60
  for key, value in uni_scores.items():
@@ -64,27 +68,47 @@ def evaluate_uni(corpus, uni_model_name):
64
  logger.info("Uni %s min max scores: %s", key, value)
65
  del uni_evaluator
66
  clean_gpu_cache()
67
- return (uni_scores['naturalness'], uni_scores['coherence'], uni_scores['understandability'],
68
- min_max_scores['naturalness'], min_max_scores['coherence'], min_max_scores['understandability'])
69
 
70
 
71
  def clean_gpu_cache():
72
  import torch
 
73
  if torch.cuda.is_available():
74
  torch.cuda.empty_cache()
75
 
76
 
77
- if __name__ == '__main__':
78
  import torch.multiprocessing as mp
 
79
  parser = argparse.ArgumentParser()
80
 
81
- parser.add_argument('--folder', type=str, default='cache/data', help='folder to load data')
82
- parser.add_argument('--output', type=str, default='cache/output', help='path to save output')
 
 
 
 
83
 
84
- parser.add_argument('--tokenizer', type=str, default='cl100k_base', help='tokenizer name')
85
- parser.add_argument('--reward', type=str, default='OpenAssistant/reward-model-deberta-v3-large-v2',
86
- help='Comma-separated list of reward models')
87
- parser.add_argument('--uni', type=str, default='MingZhong/unieval-sum', help='uni model name')
88
 
89
  args = parser.parse_args()
90
 
@@ -94,49 +118,55 @@ if __name__ == '__main__':
94
  if not os.path.exists(args.output):
95
  os.makedirs(args.output)
96
 
97
- reward_models = args.reward.split(',')
98
-
99
 
100
  results = []
101
 
102
  logger.info("Data loaded from %s", args.folder)
103
- mp.set_start_method('spawn')
104
 
105
  for file in os.listdir(args.folder):
106
- if file.endswith('.json'):
107
  logger.info("Processing %s", file)
108
- with open(os.path.join(args.folder, file), 'r', encoding='utf-8') as f:
109
  data = json.load(f)
110
- data = [TextPair(
111
- question=data[key]['question'],
112
- answer=data[key]['answer']
113
- ) for key in data]
114
 
115
  length_scores = evaluate_length(data, args.tokenizer)
116
  mtld_scores, min_max_mtld_scores = evaluate_mtld(data)
117
  reward_scores = evaluate_reward(data, reward_models)
118
- uni_naturalness_scores, uni_coherence_scores, uni_understandability_scores, \
119
- min_max_uni_naturalness_scores, min_max_uni_coherence_scores, min_max_uni_understandability_scores \
120
- = evaluate_uni(data, args.uni)
121
 
122
  result = {
123
- 'file': file,
124
- 'number': len(data),
125
- 'length': length_scores,
126
- 'mtld': mtld_scores,
127
- 'mtld_min_max': min_max_mtld_scores,
128
- 'uni_naturalness': uni_naturalness_scores,
129
- 'uni_coherence': uni_coherence_scores,
130
- 'uni_understandability': uni_understandability_scores,
131
- 'uni_naturalness_min_max': min_max_uni_naturalness_scores,
132
- 'uni_coherence_min_max': min_max_uni_coherence_scores,
133
- 'uni_understandability_min_max': min_max_uni_understandability_scores
134
  }
135
  for reward_score in reward_scores:
136
- result[reward_score['reward_name']] = reward_score['score']
137
- result[f"{reward_score['reward_name']}_min_max"] = reward_score['min_max_scores']
 
 
138
 
139
  results.append(result)
140
 
141
  results = pd.DataFrame(results)
142
- results.to_csv(os.path.join(args.output, 'evaluation.csv'), index=False)
 
1
  """Evaluate the quality of the generated text using various metrics"""
2
 
 
 
3
  import argparse
4
+ import json
5
+ import os
6
+
7
  import pandas as pd
8
  from dotenv import load_dotenv
9
+
10
+ from graphgen.bases.datatypes import QAPair
11
+
12
+ from .models import LengthEvaluator, MTLDEvaluator, RewardEvaluator, UniEvaluator
13
  from .utils import logger, set_logger
14
 
15
  sys_path = os.path.abspath(os.path.dirname(__file__))
 
17
 
18
  load_dotenv()
19
 
20
+
21
  def evaluate_length(corpus, tokenizer_name):
22
+ length_evaluator = LengthEvaluator(tokenizer_name=tokenizer_name)
 
 
23
  logger.info("Length evaluator loaded")
24
  scores = length_evaluator.get_average_score(corpus)
25
  logger.info("Length scores: %s", scores)
26
  return scores
27
 
28
+
29
  def evaluate_mtld(corpus):
30
  mtld_evaluator = MTLDEvaluator()
31
  logger.info("MTLD evaluator loaded")
 
35
  logger.info("MTLD min max scores: %s", min_max_scores)
36
  return scores, min_max_scores
37
 
38
+
39
  def evaluate_reward(corpus, reward_model_names):
40
  scores = []
41
  for reward_name in reward_model_names:
42
+ reward_evaluator = RewardEvaluator(reward_name=reward_name)
 
 
43
  logger.info("Loaded reward model: %s", reward_name)
44
  average_score = reward_evaluator.get_average_score(corpus)
45
  logger.info("%s scores: %s", reward_name, average_score)
46
  min_max_scores = reward_evaluator.get_min_max_score(corpus)
47
  logger.info("%s min max scores: %s", reward_name, min_max_scores)
48
+ scores.append(
49
+ {
50
+ "reward_name": reward_name.split("/")[-1],
51
+ "score": average_score,
52
+ "min_max_scores": min_max_scores,
53
+ }
54
+ )
55
  del reward_evaluator
56
  clean_gpu_cache()
57
  return scores
58
 
59
+
60
  def evaluate_uni(corpus, uni_model_name):
61
+ uni_evaluator = UniEvaluator(model_name=uni_model_name)
 
 
62
  logger.info("Uni evaluator loaded with model %s", uni_model_name)
63
  uni_scores = uni_evaluator.get_average_score(corpus)
64
  for key, value in uni_scores.items():
 
68
  logger.info("Uni %s min max scores: %s", key, value)
69
  del uni_evaluator
70
  clean_gpu_cache()
71
+ return (
72
+ uni_scores["naturalness"],
73
+ uni_scores["coherence"],
74
+ uni_scores["understandability"],
75
+ min_max_scores["naturalness"],
76
+ min_max_scores["coherence"],
77
+ min_max_scores["understandability"],
78
+ )
79
 
80
 
81
  def clean_gpu_cache():
82
  import torch
83
+
84
  if torch.cuda.is_available():
85
  torch.cuda.empty_cache()
86
 
87
 
88
+ if __name__ == "__main__":
89
  import torch.multiprocessing as mp
90
+
91
  parser = argparse.ArgumentParser()
92
 
93
+ parser.add_argument(
94
+ "--folder", type=str, default="cache/data", help="folder to load data"
95
+ )
96
+ parser.add_argument(
97
+ "--output", type=str, default="cache/output", help="path to save output"
98
+ )
99
 
100
+ parser.add_argument(
101
+ "--tokenizer", type=str, default="cl100k_base", help="tokenizer name"
102
+ )
103
+ parser.add_argument(
104
+ "--reward",
105
+ type=str,
106
+ default="OpenAssistant/reward-model-deberta-v3-large-v2",
107
+ help="Comma-separated list of reward models",
108
+ )
109
+ parser.add_argument(
110
+ "--uni", type=str, default="MingZhong/unieval-sum", help="uni model name"
111
+ )
112
 
113
  args = parser.parse_args()
114
 
 
118
  if not os.path.exists(args.output):
119
  os.makedirs(args.output)
120
 
121
+ reward_models = args.reward.split(",")
 
122
 
123
  results = []
124
 
125
  logger.info("Data loaded from %s", args.folder)
126
+ mp.set_start_method("spawn")
127
 
128
  for file in os.listdir(args.folder):
129
+ if file.endswith(".json"):
130
  logger.info("Processing %s", file)
131
+ with open(os.path.join(args.folder, file), "r", encoding="utf-8") as f:
132
  data = json.load(f)
133
+ data = [
134
+ QAPair(question=data[key]["question"], answer=data[key]["answer"])
135
+ for key in data
136
+ ]
137
 
138
  length_scores = evaluate_length(data, args.tokenizer)
139
  mtld_scores, min_max_mtld_scores = evaluate_mtld(data)
140
  reward_scores = evaluate_reward(data, reward_models)
141
+ (
142
+ uni_naturalness_scores,
143
+ uni_coherence_scores,
144
+ uni_understandability_scores,
145
+ min_max_uni_naturalness_scores,
146
+ min_max_uni_coherence_scores,
147
+ min_max_uni_understandability_scores,
148
+ ) = evaluate_uni(data, args.uni)
149
 
150
  result = {
151
+ "file": file,
152
+ "number": len(data),
153
+ "length": length_scores,
154
+ "mtld": mtld_scores,
155
+ "mtld_min_max": min_max_mtld_scores,
156
+ "uni_naturalness": uni_naturalness_scores,
157
+ "uni_coherence": uni_coherence_scores,
158
+ "uni_understandability": uni_understandability_scores,
159
+ "uni_naturalness_min_max": min_max_uni_naturalness_scores,
160
+ "uni_coherence_min_max": min_max_uni_coherence_scores,
161
+ "uni_understandability_min_max": min_max_uni_understandability_scores,
162
  }
163
  for reward_score in reward_scores:
164
+ result[reward_score["reward_name"]] = reward_score["score"]
165
+ result[f"{reward_score['reward_name']}_min_max"] = reward_score[
166
+ "min_max_scores"
167
+ ]
168
 
169
  results.append(result)
170
 
171
  results = pd.DataFrame(results)
172
+ results.to_csv(os.path.join(args.output, "evaluation.csv"), index=False)
graphgen/graphgen.py CHANGED
@@ -8,8 +8,8 @@ import gradio as gr
8
  from tqdm.asyncio import tqdm as tqdm_async
9
 
10
  from graphgen.bases.base_storage import StorageNameSpace
 
11
  from graphgen.models import (
12
- Chunk,
13
  JsonKVStorage,
14
  JsonListStorage,
15
  NetworkXStorage,
@@ -17,6 +17,7 @@ from graphgen.models import (
17
  Tokenizer,
18
  TraverseStrategy,
19
  read_file,
 
20
  )
21
 
22
  from .operators import (
@@ -32,6 +33,7 @@ from .operators import (
32
  from .utils import (
33
  compute_content_hash,
34
  create_event_loop,
 
35
  format_generation_results,
36
  logger,
37
  )
@@ -50,11 +52,6 @@ class GraphGen:
50
  synthesizer_llm_client: OpenAIModel = None
51
  trainee_llm_client: OpenAIModel = None
52
 
53
- # text chunking
54
- # TODO: make it configurable
55
- chunk_size: int = 1024
56
- chunk_overlap_size: int = 100
57
-
58
  # search
59
  search_config: dict = field(
60
  default_factory=lambda: {"enabled": False, "search_types": ["wikipedia"]}
@@ -136,14 +133,22 @@ class GraphGen:
136
  async for doc_key, doc in tqdm_async(
137
  new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
138
  ):
139
  chunks = {
140
- compute_content_hash(dp["content"], prefix="chunk-"): {
141
- **dp,
142
  "full_doc_id": doc_key,
 
 
143
  }
144
- for dp in self.tokenizer_instance.chunk_by_token_size(
145
- doc["content"], self.chunk_overlap_size, self.chunk_size
146
- )
147
  }
148
  inserting_chunks.update(chunks)
149
 
@@ -171,7 +176,7 @@ class GraphGen:
171
  insert chunks into the graph
172
  """
173
 
174
- input_file = self.config["input_file"]
175
  data = read_file(input_file)
176
  inserting_chunks = await self.async_split_chunks(data)
177
 
 
8
  from tqdm.asyncio import tqdm as tqdm_async
9
 
10
  from graphgen.bases.base_storage import StorageNameSpace
11
+ from graphgen.bases.datatypes import Chunk
12
  from graphgen.models import (
 
13
  JsonKVStorage,
14
  JsonListStorage,
15
  NetworkXStorage,
 
17
  Tokenizer,
18
  TraverseStrategy,
19
  read_file,
20
+ split_chunks,
21
  )
22
 
23
  from .operators import (
 
33
  from .utils import (
34
  compute_content_hash,
35
  create_event_loop,
36
+ detect_main_language,
37
  format_generation_results,
38
  logger,
39
  )
 
52
  synthesizer_llm_client: OpenAIModel = None
53
  trainee_llm_client: OpenAIModel = None
54
 
55
  # search
56
  search_config: dict = field(
57
  default_factory=lambda: {"enabled": False, "search_types": ["wikipedia"]}
 
133
  async for doc_key, doc in tqdm_async(
134
  new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
135
  ):
136
+ doc_language = detect_main_language(doc["content"])
137
+ text_chunks = split_chunks(
138
+ doc["content"],
139
+ language=doc_language,
140
+ chunk_size=self.config["split"]["chunk_size"],
141
+ chunk_overlap=self.config["split"]["chunk_overlap"],
142
+ )
143
+
144
  chunks = {
145
+ compute_content_hash(txt, prefix="chunk-"): {
146
+ "content": txt,
147
  "full_doc_id": doc_key,
148
+ "length": len(self.tokenizer_instance.encode_string(txt)),
149
+ "language": doc_language,
150
  }
151
+ for txt in text_chunks
 
 
152
  }
153
  inserting_chunks.update(chunks)
154
 
 
176
  insert chunks into the graph
177
  """
178
 
179
+ input_file = self.config["read"]["input_file"]
180
  data = read_file(input_file)
181
  inserting_chunks = await self.async_split_chunks(data)
182
 
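With this change GraphGen reads the input path and chunking settings from nested config sections instead of top-level keys and dataclass fields. A sketch of the minimal shape now expected (values copied from the updated YAML configs; remaining keys omitted):

```python
config = {
    "read": {"input_file": "resources/input_examples/jsonl_demo.jsonl"},
    "split": {"chunk_size": 1024, "chunk_overlap": 100},
    "output_data_type": "aggregated",
    "output_data_format": "ChatML",
    "tokenizer": "cl100k_base",
    # ... other generation/traversal keys are unchanged by this commit
}
```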
graphgen/models/__init__.py CHANGED
@@ -11,36 +11,7 @@ from .search.db.uniprot_search import UniProtSearch
11
  from .search.kg.wiki_search import WikiSearch
12
  from .search.web.bing_search import BingSearch
13
  from .search.web.google_search import GoogleSearch
 
14
  from .storage.json_storage import JsonKVStorage, JsonListStorage
15
  from .storage.networkx_storage import NetworkXStorage
16
  from .strategy.travserse_strategy import TraverseStrategy
17
- from .text.chunk import Chunk
18
- from .text.text_pair import TextPair
19
-
20
- __all__ = [
21
- # llm models
22
- "OpenAIModel",
23
- "TopkTokenModel",
24
- "Token",
25
- "Tokenizer",
26
- # storage models
27
- "Chunk",
28
- "NetworkXStorage",
29
- "JsonKVStorage",
30
- "JsonListStorage",
31
- # search models
32
- "WikiSearch",
33
- "GoogleSearch",
34
- "BingSearch",
35
- "UniProtSearch",
36
- # evaluate models
37
- "TextPair",
38
- "LengthEvaluator",
39
- "MTLDEvaluator",
40
- "RewardEvaluator",
41
- "UniEvaluator",
42
- # strategy models
43
- "TraverseStrategy",
44
- # community models
45
- "CommunityDetector",
46
- ]
 
11
  from .search.kg.wiki_search import WikiSearch
12
  from .search.web.bing_search import BingSearch
13
  from .search.web.google_search import GoogleSearch
14
+ from .splitter import split_chunks
15
  from .storage.json_storage import JsonKVStorage, JsonListStorage
16
  from .storage.networkx_storage import NetworkXStorage
17
  from .strategy.travserse_strategy import TraverseStrategy
graphgen/models/evaluate/base_evaluator.py CHANGED
@@ -1,22 +1,24 @@
1
  import asyncio
2
-
3
  from dataclasses import dataclass
 
4
  from tqdm.asyncio import tqdm as tqdm_async
 
 
5
  from graphgen.utils import create_event_loop
6
- from graphgen.models.text.text_pair import TextPair
7
 
8
  @dataclass
9
  class BaseEvaluator:
10
  max_concurrent: int = 100
11
  results: list[float] = None
12
 
13
- def evaluate(self, pairs: list[TextPair]) -> list[float]:
14
  """
15
  Evaluate the text and return a score.
16
  """
17
  return create_event_loop().run_until_complete(self.async_evaluate(pairs))
18
 
19
- async def async_evaluate(self, pairs: list[TextPair]) -> list[float]:
20
  semaphore = asyncio.Semaphore(self.max_concurrent)
21
 
22
  async def evaluate_with_semaphore(pair):
@@ -31,10 +33,10 @@ class BaseEvaluator:
31
  results.append(await result)
32
  return results
33
 
34
- async def evaluate_single(self, pair: TextPair) -> float:
35
  raise NotImplementedError()
36
 
37
- def get_average_score(self, pairs: list[TextPair]) -> float:
38
  """
39
  Get the average score of a batch of texts.
40
  """
@@ -42,7 +44,7 @@ class BaseEvaluator:
42
  self.results = results
43
  return sum(self.results) / len(pairs)
44
 
45
- def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]:
46
  """
47
  Get the min and max score of a batch of texts.
48
  """
 
1
  import asyncio
 
2
  from dataclasses import dataclass
3
+
4
  from tqdm.asyncio import tqdm as tqdm_async
5
+
6
+ from graphgen.bases.datatypes import QAPair
7
  from graphgen.utils import create_event_loop
8
+
9
 
10
  @dataclass
11
  class BaseEvaluator:
12
  max_concurrent: int = 100
13
  results: list[float] = None
14
 
15
+ def evaluate(self, pairs: list[QAPair]) -> list[float]:
16
  """
17
  Evaluate the text and return a score.
18
  """
19
  return create_event_loop().run_until_complete(self.async_evaluate(pairs))
20
 
21
+ async def async_evaluate(self, pairs: list[QAPair]) -> list[float]:
22
  semaphore = asyncio.Semaphore(self.max_concurrent)
23
 
24
  async def evaluate_with_semaphore(pair):
 
33
  results.append(await result)
34
  return results
35
 
36
+ async def evaluate_single(self, pair: QAPair) -> float:
37
  raise NotImplementedError()
38
 
39
+ def get_average_score(self, pairs: list[QAPair]) -> float:
40
  """
41
  Get the average score of a batch of texts.
42
  """
 
44
  self.results = results
45
  return sum(self.results) / len(pairs)
46
 
47
+ def get_min_max_score(self, pairs: list[QAPair]) -> tuple[float, float]:
48
  """
49
  Get the min and max score of a batch of texts.
50
  """
graphgen/models/evaluate/length_evaluator.py CHANGED
@@ -1,19 +1,19 @@
1
  from dataclasses import dataclass
 
 
2
  from graphgen.models.evaluate.base_evaluator import BaseEvaluator
3
  from graphgen.models.llm.tokenizer import Tokenizer
4
- from graphgen.models.text.text_pair import TextPair
5
  from graphgen.utils import create_event_loop
6
 
7
 
8
  @dataclass
9
  class LengthEvaluator(BaseEvaluator):
10
  tokenizer_name: str = "cl100k_base"
 
11
  def __post_init__(self):
12
- self.tokenizer = Tokenizer(
13
- model_name=self.tokenizer_name
14
- )
15
 
16
- async def evaluate_single(self, pair: TextPair) -> float:
17
  loop = create_event_loop()
18
  return await loop.run_in_executor(None, self._calculate_length, pair.answer)
19
 
 
1
  from dataclasses import dataclass
2
+
3
+ from graphgen.bases.datatypes import QAPair
4
  from graphgen.models.evaluate.base_evaluator import BaseEvaluator
5
  from graphgen.models.llm.tokenizer import Tokenizer
 
6
  from graphgen.utils import create_event_loop
7
 
8
 
9
  @dataclass
10
  class LengthEvaluator(BaseEvaluator):
11
  tokenizer_name: str = "cl100k_base"
12
+
13
  def __post_init__(self):
14
+ self.tokenizer = Tokenizer(model_name=self.tokenizer_name)
 
 
15
 
16
+ async def evaluate_single(self, pair: QAPair) -> float:
17
  loop = create_event_loop()
18
  return await loop.run_in_executor(None, self._calculate_length, pair.answer)
19
 
graphgen/models/evaluate/mtld_evaluator.py CHANGED
@@ -1,22 +1,27 @@
1
- from dataclasses import dataclass, field
2
  from typing import Set
3
 
 
4
  from graphgen.models.evaluate.base_evaluator import BaseEvaluator
5
- from graphgen.models.text.text_pair import TextPair
6
- from graphgen.utils import detect_main_language, NLTKHelper, create_event_loop
7
-
8
 
9
  nltk_helper = NLTKHelper()
10
 
 
11
  @dataclass
12
  class MTLDEvaluator(BaseEvaluator):
13
  """
14
  衡量文本词汇多样性的指标
15
  """
16
- stopwords_en: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("english")))
17
- stopwords_zh: Set[str] = field(default_factory=lambda: set(nltk_helper.get_stopwords("chinese")))
18
 
19
- async def evaluate_single(self, pair: TextPair) -> float:
20
  loop = create_event_loop()
21
  return await loop.run_in_executor(None, self._calculate_mtld_score, pair.answer)
22
 
@@ -71,6 +76,6 @@ class MTLDEvaluator(BaseEvaluator):
71
  if ttr <= threshold:
72
  factors += 1
73
  else:
74
- factors += (1 - (ttr - threshold) / (1 - threshold))
75
 
76
  return len(tokens) / factors if factors > 0 else len(tokens)
 
1
+ from dataclasses import dataclass, field
2
  from typing import Set
3
 
4
+ from graphgen.bases.datatypes import QAPair
5
  from graphgen.models.evaluate.base_evaluator import BaseEvaluator
6
+ from graphgen.utils import NLTKHelper, create_event_loop, detect_main_language
 
 
7
 
8
  nltk_helper = NLTKHelper()
9
 
10
+
11
  @dataclass
12
  class MTLDEvaluator(BaseEvaluator):
13
  """
14
  衡量文本词汇多样性的指标
15
  """
 
 
16
 
17
+ stopwords_en: Set[str] = field(
18
+ default_factory=lambda: set(nltk_helper.get_stopwords("english"))
19
+ )
20
+ stopwords_zh: Set[str] = field(
21
+ default_factory=lambda: set(nltk_helper.get_stopwords("chinese"))
22
+ )
23
+
24
+ async def evaluate_single(self, pair: QAPair) -> float:
25
  loop = create_event_loop()
26
  return await loop.run_in_executor(None, self._calculate_mtld_score, pair.answer)
27
 
 
76
  if ttr <= threshold:
77
  factors += 1
78
  else:
79
+ factors += 1 - (ttr - threshold) / (1 - threshold)
80
 
81
  return len(tokens) / factors if factors > 0 else len(tokens)
graphgen/models/evaluate/reward_evaluator.py CHANGED
@@ -1,6 +1,8 @@
1
  from dataclasses import dataclass
 
2
  from tqdm import tqdm
3
- from graphgen.models.text.text_pair import TextPair
 
4
 
5
 
6
  @dataclass
@@ -9,19 +11,22 @@ class RewardEvaluator:
9
  Reward Model Evaluator.
10
  OpenAssistant/reward-model-deberta-v3-large-v2: 分数范围为[-inf, inf],越高越好
11
  """
 
12
  reward_name: str = "OpenAssistant/reward-model-deberta-v3-large-v2"
13
  max_length: int = 2560
14
  results: list[float] = None
15
 
16
  def __post_init__(self):
17
  import torch
 
18
  self.num_gpus = torch.cuda.device_count()
19
 
20
  @staticmethod
21
  def process_chunk(rank, pairs, reward_name, max_length, return_dict):
22
  import torch
23
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
24
- device = f'cuda:{rank}'
 
25
  torch.cuda.set_device(rank)
26
 
27
  rank_model = AutoModelForSequenceClassification.from_pretrained(reward_name)
@@ -37,7 +42,7 @@ class RewardEvaluator:
37
  pair.answer,
38
  return_tensors="pt",
39
  max_length=max_length,
40
- truncation=True
41
  )
42
  inputs = {k: v.to(device) for k, v in inputs.items()}
43
  score = rank_model(**inputs).logits[0].item()
@@ -45,8 +50,9 @@ class RewardEvaluator:
45
 
46
  return_dict[rank] = results
47
 
48
- def evaluate(self, pairs: list[TextPair]) -> list[float]:
49
  import torch.multiprocessing as mp
 
50
  chunk_size = len(pairs) // self.num_gpus
51
  chunks = []
52
  for i in range(self.num_gpus):
@@ -64,7 +70,7 @@ class RewardEvaluator:
64
  for rank, chunk in enumerate(chunks):
65
  p = mp.Process(
66
  target=self.process_chunk,
67
- args=(rank, chunk, self.reward_name, self.max_length, return_dict)
68
  )
69
  p.start()
70
  processes.append(p)
@@ -84,7 +90,7 @@ class RewardEvaluator:
84
 
85
  return results
86
 
87
- def get_average_score(self, pairs: list[TextPair]) -> float:
88
  """
89
  Get the average score of a batch of texts.
90
  """
@@ -92,7 +98,7 @@ class RewardEvaluator:
92
  self.results = results
93
  return sum(self.results) / len(pairs)
94
 
95
- def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]:
96
  """
97
  Get the min and max score of a batch of texts.
98
  """
 
1
  from dataclasses import dataclass
2
+
3
  from tqdm import tqdm
4
+
5
+ from graphgen.bases.datatypes import QAPair
6
 
7
 
8
  @dataclass
 
11
  Reward Model Evaluator.
12
  OpenAssistant/reward-model-deberta-v3-large-v2: 分数范围为[-inf, inf],越高越好
13
  """
14
+
15
  reward_name: str = "OpenAssistant/reward-model-deberta-v3-large-v2"
16
  max_length: int = 2560
17
  results: list[float] = None
18
 
19
  def __post_init__(self):
20
  import torch
21
+
22
  self.num_gpus = torch.cuda.device_count()
23
 
24
  @staticmethod
25
  def process_chunk(rank, pairs, reward_name, max_length, return_dict):
26
  import torch
27
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
28
+
29
+ device = f"cuda:{rank}"
30
  torch.cuda.set_device(rank)
31
 
32
  rank_model = AutoModelForSequenceClassification.from_pretrained(reward_name)
 
42
  pair.answer,
43
  return_tensors="pt",
44
  max_length=max_length,
45
+ truncation=True,
46
  )
47
  inputs = {k: v.to(device) for k, v in inputs.items()}
48
  score = rank_model(**inputs).logits[0].item()
 
50
 
51
  return_dict[rank] = results
52
 
53
+ def evaluate(self, pairs: list[QAPair]) -> list[float]:
54
  import torch.multiprocessing as mp
55
+
56
  chunk_size = len(pairs) // self.num_gpus
57
  chunks = []
58
  for i in range(self.num_gpus):
 
70
  for rank, chunk in enumerate(chunks):
71
  p = mp.Process(
72
  target=self.process_chunk,
73
+ args=(rank, chunk, self.reward_name, self.max_length, return_dict),
74
  )
75
  p.start()
76
  processes.append(p)
 
90
 
91
  return results
92
 
93
+ def get_average_score(self, pairs: list[QAPair]) -> float:
94
  """
95
  Get the average score of a batch of texts.
96
  """
 
98
  self.results = results
99
  return sum(self.results) / len(pairs)
100
 
101
+ def get_min_max_score(self, pairs: list[QAPair]) -> tuple[float, float]:
102
  """
103
  Get the min and max score of a batch of texts.
104
  """
graphgen/models/evaluate/uni_evaluator.py CHANGED
@@ -1,40 +1,58 @@
1
  # https://github.com/maszhongming/UniEval/tree/main
2
 
3
  from dataclasses import dataclass, field
 
4
  from tqdm import tqdm
5
- from graphgen.models.text.text_pair import TextPair
 
6
 
7
 
8
  def _add_questions(dimension: str, question: str, answer: str):
9
  if dimension == "naturalness":
10
- cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + answer
 
 
 
11
  elif dimension == "coherence":
12
- cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: ' \
13
- + answer + ' </s> dialogue history: ' + question
 
 
 
 
14
  elif dimension == "understandability":
15
- cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + answer
 
 
 
16
  else:
17
  raise NotImplementedError(
18
- 'The input format for this dimension is still undefined. Please customize it first.')
 
19
  return cur_input
20
 
 
21
  @dataclass
22
  class UniEvaluator:
23
  model_name: str = "MingZhong/unieval-sum"
24
- dimensions: list = field(default_factory=lambda: ['naturalness', 'coherence', 'understandability'])
 
 
25
  max_length: int = 2560
26
  results: dict = None
27
 
28
  def __post_init__(self):
29
  import torch
 
30
  self.num_gpus = torch.cuda.device_count()
31
  self.results = {}
32
 
33
  @staticmethod
34
  def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict):
35
  import torch
36
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
37
- device = f'cuda:{rank}'
 
38
  torch.cuda.set_device(rank)
39
 
40
  rank_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
@@ -59,26 +77,26 @@ class UniEvaluator:
59
  max_length=max_length,
60
  truncation=True,
61
  padding=True,
62
- return_tensors='pt'
63
  )
64
  encoded_tgt = tokenizer(
65
  tgt,
66
  max_length=max_length,
67
  truncation=True,
68
  padding=True,
69
- return_tensors='pt'
70
  )
71
 
72
- src_tokens = encoded_src['input_ids'].to(device)
73
- src_mask = encoded_src['attention_mask'].to(device)
74
 
75
- tgt_tokens = encoded_tgt['input_ids'].to(device)[:, 0].unsqueeze(-1)
76
 
77
  output = rank_model(
78
  input_ids=src_tokens,
79
  attention_mask=src_mask,
80
  labels=tgt_tokens,
81
- use_cache = False
82
  )
83
 
84
  logits = output.logits.view(-1, rank_model.config.vocab_size)
@@ -91,8 +109,9 @@ class UniEvaluator:
91
 
92
  return_dict[rank] = results
93
 
94
- def evaluate(self, pairs: list[TextPair]) -> list[dict]:
95
  import torch.multiprocessing as mp
 
96
  final_results = []
97
  for dimension in self.dimensions:
98
  chunk_size = len(pairs) // self.num_gpus
@@ -112,7 +131,14 @@ class UniEvaluator:
112
  for rank, chunk in enumerate(chunks):
113
  p = mp.Process(
114
  target=self.process_chunk,
115
- args=(rank, chunk, self.model_name, self.max_length, dimension, return_dict)
116
  )
117
  p.start()
118
  processes.append(p)
@@ -130,12 +156,10 @@ class UniEvaluator:
130
  p.terminate()
131
  p.join()
132
 
133
- final_results.append({
134
- dimension: results
135
- })
136
  return final_results
137
 
138
- def get_average_score(self, pairs: list[TextPair]) -> dict:
139
  """
140
  Get the average score of a batch of texts.
141
  """
@@ -147,7 +171,7 @@ class UniEvaluator:
147
  self.results[key] = value
148
  return final_results
149
 
150
- def get_min_max_score(self, pairs: list[TextPair]) -> dict:
151
  """
152
  Get the min and max score of a batch of texts.
153
  """
 
1
  # https://github.com/maszhongming/UniEval/tree/main
2
 
3
  from dataclasses import dataclass, field
4
+
5
  from tqdm import tqdm
6
+
7
+ from graphgen.bases.datatypes import QAPair
8
 
9
 
10
  def _add_questions(dimension: str, question: str, answer: str):
11
  if dimension == "naturalness":
12
+ cur_input = (
13
+ "question: Is this a natural response in the dialogue? </s> response: "
14
+ + answer
15
+ )
16
  elif dimension == "coherence":
17
+ cur_input = (
18
+ "question: Is this a coherent response given the dialogue history? </s> response: "
19
+ + answer
20
+ + " </s> dialogue history: "
21
+ + question
22
+ )
23
  elif dimension == "understandability":
24
+ cur_input = (
25
+ "question: Is this an understandable response in the dialogue? </s> response: "
26
+ + answer
27
+ )
28
  else:
29
  raise NotImplementedError(
30
+ "The input format for this dimension is still undefined. Please customize it first."
31
+ )
32
  return cur_input
33
 
34
+
35
  @dataclass
36
  class UniEvaluator:
37
  model_name: str = "MingZhong/unieval-sum"
38
+ dimensions: list = field(
39
+ default_factory=lambda: ["naturalness", "coherence", "understandability"]
40
+ )
41
  max_length: int = 2560
42
  results: dict = None
43
 
44
  def __post_init__(self):
45
  import torch
46
+
47
  self.num_gpus = torch.cuda.device_count()
48
  self.results = {}
49
 
50
  @staticmethod
51
  def process_chunk(rank, pairs, model_name, max_length, dimension, return_dict):
52
  import torch
53
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
54
+
55
+ device = f"cuda:{rank}"
56
  torch.cuda.set_device(rank)
57
 
58
  rank_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
77
  max_length=max_length,
78
  truncation=True,
79
  padding=True,
80
+ return_tensors="pt",
81
  )
82
  encoded_tgt = tokenizer(
83
  tgt,
84
  max_length=max_length,
85
  truncation=True,
86
  padding=True,
87
+ return_tensors="pt",
88
  )
89
 
90
+ src_tokens = encoded_src["input_ids"].to(device)
91
+ src_mask = encoded_src["attention_mask"].to(device)
92
 
93
+ tgt_tokens = encoded_tgt["input_ids"].to(device)[:, 0].unsqueeze(-1)
94
 
95
  output = rank_model(
96
  input_ids=src_tokens,
97
  attention_mask=src_mask,
98
  labels=tgt_tokens,
99
+ use_cache=False,
100
  )
101
 
102
  logits = output.logits.view(-1, rank_model.config.vocab_size)
 
109
 
110
  return_dict[rank] = results
111
 
112
+ def evaluate(self, pairs: list[QAPair]) -> list[dict]:
113
  import torch.multiprocessing as mp
114
+
115
  final_results = []
116
  for dimension in self.dimensions:
117
  chunk_size = len(pairs) // self.num_gpus
 
131
  for rank, chunk in enumerate(chunks):
132
  p = mp.Process(
133
  target=self.process_chunk,
134
+ args=(
135
+ rank,
136
+ chunk,
137
+ self.model_name,
138
+ self.max_length,
139
+ dimension,
140
+ return_dict,
141
+ ),
142
  )
143
  p.start()
144
  processes.append(p)
 
156
  p.terminate()
157
  p.join()
158
 
159
+ final_results.append({dimension: results})
 
 
160
  return final_results
161
 
162
+ def get_average_score(self, pairs: list[QAPair]) -> dict:
163
  """
164
  Get the average score of a batch of texts.
165
  """
 
171
  self.results[key] = value
172
  return final_results
173
 
174
+ def get_min_max_score(self, pairs: list[QAPair]) -> dict:
175
  """
176
  Get the min and max score of a batch of texts.
177
  """
graphgen/models/splitter/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ from functools import lru_cache
2
+ from typing import Union
3
+
4
+ from .recursive_character_splitter import (
5
+ ChineseRecursiveTextSplitter,
6
+ RecursiveCharacterSplitter,
7
+ )
8
+
9
+ _MAPPING = {
10
+ "en": RecursiveCharacterSplitter,
11
+ "zh": ChineseRecursiveTextSplitter,
12
+ }
13
+
14
+ SplitterT = Union[RecursiveCharacterSplitter, ChineseRecursiveTextSplitter]
15
+
16
+
17
+ @lru_cache(maxsize=None)
18
+ def _get_splitter(language: str, frozen_kwargs: frozenset) -> SplitterT:
19
+ cls = _MAPPING[language]
20
+ kwargs = dict(frozen_kwargs)
21
+ return cls(**kwargs)
22
+
23
+
24
+ def split_chunks(text: str, language: str = "en", **kwargs) -> list:
25
+ if language not in _MAPPING:
26
+ raise ValueError(
27
+ f"Unsupported language: {language}. "
28
+ f"Supported languages are: {list(_MAPPING.keys())}"
29
+ )
30
+ splitter = _get_splitter(language, frozenset(kwargs.items()))
31
+ return splitter.split_text(text)
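A usage sketch of the new entry point (not part of the commit; the sample strings are illustrative). Keyword arguments are forwarded to the splitter constructor and must be hashable, since they are frozen into the lru_cache key:

```python
from graphgen.models import split_chunks

en_chunks = split_chunks(
    "A long English document ...", language="en", chunk_size=1024, chunk_overlap=100
)
zh_chunks = split_chunks(
    "一段较长的中文文档……", language="zh", chunk_size=1024, chunk_overlap=100
)
```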
graphgen/models/splitter/character_splitter.py ADDED
@@ -0,0 +1,26 @@
1
+ import re
2
+ from typing import Any, List
3
+
4
+ from graphgen.bases.base_splitter import BaseSplitter
5
+
6
+
7
+ class CharacterSplitter(BaseSplitter):
8
+ """Splitting text that looks at characters."""
9
+
10
+ def __init__(
11
+ self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
12
+ ) -> None:
13
+ """Create a new TextSplitter."""
14
+ super().__init__(**kwargs)
15
+ self._separator = separator
16
+ self._is_separator_regex = is_separator_regex
17
+
18
+ def split_text(self, text: str) -> List[str]:
19
+ """Split incoming text and return chunks."""
20
+ # First we naively split the large input into a bunch of smaller ones.
21
+ separator = (
22
+ self._separator if self._is_separator_regex else re.escape(self._separator)
23
+ )
24
+ splits = self._split_text_with_regex(text, separator, self.keep_separator)
25
+ _separator = "" if self.keep_separator else self._separator
26
+ return self._merge_splits(splits, _separator)
graphgen/models/splitter/markdown_splitter.py ADDED
@@ -0,0 +1,33 @@
1
+ from typing import Any
2
+
3
+ from graphgen.models.splitter.recursive_character_splitter import (
4
+ RecursiveCharacterSplitter,
5
+ )
6
+
7
+
8
+ class MarkdownTextRefSplitter(RecursiveCharacterSplitter):
9
+ """Attempts to split the text along Markdown-formatted headings."""
10
+
11
+ def __init__(self, **kwargs: Any) -> None:
12
+ """Initialize a MarkdownTextRefSplitter."""
13
+ separators = [
14
+ # First, try to split along Markdown headings (starting with level 2)
15
+ "\n#{1,6} ",
16
+ # Note the alternative syntax for headings (below) is not handled here
17
+ # Heading level 2
18
+ # ---------------
19
+ # End of code block
20
+ "```\n",
21
+ # Horizontal lines
22
+ "\n\\*\\*\\*+\n",
23
+ "\n---+\n",
24
+ "\n___+\n",
25
+ # Note: horizontal lines defined by three or more of ***, ---, or ___
26
+ # are handled by the regexes above, but alternative syntaxes (e.g., with spaces)
27
+ # are not handled.
28
+ "\n\n",
29
+ "\n",
30
+ " ",
31
+ "",
32
+ ]
33
+ super().__init__(separators=separators, **kwargs)
graphgen/models/splitter/recursive_character_splitter.py ADDED
@@ -0,0 +1,149 @@
1
+ import re
2
+ from typing import Any, List, Optional
3
+
4
+ from graphgen.bases.base_splitter import BaseSplitter
5
+
6
+
7
+ class RecursiveCharacterSplitter(BaseSplitter):
8
+ """Splitting text by recursively look at characters.
9
+
10
+ Recursively tries to split by different characters to find one that works.
11
+ """
12
+
13
+ def __init__(
14
+ self,
15
+ separators: Optional[List[str]] = None,
16
+ keep_separator: bool = True,
17
+ is_separator_regex: bool = False,
18
+ **kwargs: Any,
19
+ ) -> None:
20
+ """Create a new TextSplitter."""
21
+ super().__init__(keep_separator=keep_separator, **kwargs)
22
+ self._separators = separators or ["\n\n", "\n", " ", ""]
23
+ self._is_separator_regex = is_separator_regex
24
+
25
+ def _split_text(self, text: str, separators: List[str]) -> List[str]:
26
+ """Split incoming text and return chunks."""
27
+ final_chunks = []
28
+ # Get appropriate separator to use
29
+ separator = separators[-1]
30
+ new_separators = []
31
+ for i, _s in enumerate(separators):
32
+ _separator = _s if self._is_separator_regex else re.escape(_s)
33
+ if _s == "":
34
+ separator = _s
35
+ break
36
+ if re.search(_separator, text):
37
+ separator = _s
38
+ new_separators = separators[i + 1 :]
39
+ break
40
+
41
+ _separator = separator if self._is_separator_regex else re.escape(separator)
42
+ splits = self._split_text_with_regex(text, _separator, self.keep_separator)
43
+
44
+ # Now go merging things, recursively splitting longer texts.
45
+ _good_splits = []
46
+ _separator = "" if self.keep_separator else separator
47
+ for s in splits:
48
+ if self.length_function(s) < self.chunk_size:
49
+ _good_splits.append(s)
50
+ else:
51
+ if _good_splits:
52
+ merged_text = self._merge_splits(_good_splits, _separator)
53
+ final_chunks.extend(merged_text)
54
+ _good_splits = []
55
+ if not new_separators:
56
+ final_chunks.append(s)
57
+ else:
58
+ other_info = self._split_text(s, new_separators)
59
+ final_chunks.extend(other_info)
60
+ if _good_splits:
61
+ merged_text = self._merge_splits(_good_splits, _separator)
62
+ final_chunks.extend(merged_text)
63
+ return final_chunks
64
+
65
+ def split_text(self, text: str) -> List[str]:
66
+ return self._split_text(text, self._separators)
67
+
68
+
69
+ class ChineseRecursiveTextSplitter(RecursiveCharacterSplitter):
70
+ def __init__(
71
+ self,
72
+ separators: Optional[List[str]] = None,
73
+ keep_separator: bool = True,
74
+ is_separator_regex: bool = True,
75
+ **kwargs: Any,
76
+ ) -> None:
77
+ super().__init__(keep_separator=keep_separator, **kwargs)
78
+ self._separators = separators or [
79
+ "\n\n",
80
+ "\n",
81
+ "。|!|?",
82
+ r"\.\s|\!\s|\?\s",
83
+ r";|;\s",
84
+ r",|,\s",
85
+ ]
86
+ self._is_separator_regex = is_separator_regex
87
+
88
+ def _split_text_with_regex_from_end(
89
+ self, text: str, separator: str, keep_separator: bool
90
+ ) -> List[str]:
91
+ # Now that we have the separator, split the text
92
+ if separator:
93
+ if keep_separator:
94
+ # The parentheses in the pattern keep the delimiters in the result.
95
+ _splits = re.split(f"({separator})", text)
96
+ splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])]
97
+ if len(_splits) % 2 == 1:
98
+ splits += _splits[-1:]
99
+ else:
100
+ splits = re.split(separator, text)
101
+ else:
102
+ splits = list(text)
103
+ return [s for s in splits if s != ""]
104
+
105
+ def _split_text(self, text: str, separators: List[str]) -> List[str]:
106
+ """Split incoming text and return chunks."""
107
+ final_chunks = []
108
+ # Get appropriate separator to use
109
+ separator = separators[-1]
110
+ new_separators = []
111
+ for i, _s in enumerate(separators):
112
+ _separator = _s if self._is_separator_regex else re.escape(_s)
113
+ if _s == "":
114
+ separator = _s
115
+ break
116
+ if re.search(_separator, text):
117
+ separator = _s
118
+ new_separators = separators[i + 1 :]
119
+ break
120
+
121
+ _separator = separator if self._is_separator_regex else re.escape(separator)
122
+ splits = self._split_text_with_regex_from_end(
123
+ text, _separator, self.keep_separator
124
+ )
125
+
126
+ # Now go merging things, recursively splitting longer texts.
127
+ _good_splits = []
128
+ _separator = "" if self.keep_separator else separator
129
+ for s in splits:
130
+ if self.length_function(s) < self.chunk_size:
131
+ _good_splits.append(s)
132
+ else:
133
+ if _good_splits:
134
+ merged_text = self._merge_splits(_good_splits, _separator)
135
+ final_chunks.extend(merged_text)
136
+ _good_splits = []
137
+ if not new_separators:
138
+ final_chunks.append(s)
139
+ else:
140
+ other_info = self._split_text(s, new_separators)
141
+ final_chunks.extend(other_info)
142
+ if _good_splits:
143
+ merged_text = self._merge_splits(_good_splits, _separator)
144
+ final_chunks.extend(merged_text)
145
+ return [
146
+ re.sub(r"\n{2,}", "\n", chunk.strip())
147
+ for chunk in final_chunks
148
+ if chunk.strip() != ""
149
+ ]
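The two recursive splitters can also be instantiated directly; a sketch with illustrative inputs and deliberately small chunk sizes:

```python
from graphgen.models.splitter.recursive_character_splitter import (
    ChineseRecursiveTextSplitter,
    RecursiveCharacterSplitter,
)

# English: falls back through "\n\n", "\n", " ", "" until pieces fit chunk_size.
en = RecursiveCharacterSplitter(chunk_size=50, chunk_overlap=0)
print(en.split_text("Sentence one. Sentence two.\n\nA new paragraph follows here."))

# Chinese: uses sentence-ending punctuation regexes and splits from the end.
zh = ChineseRecursiveTextSplitter(chunk_size=20, chunk_overlap=0)
print(zh.split_text("第一句话。第二句话!第三句话?"))
```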
graphgen/models/text/chunk.py DELETED
@@ -1,7 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
-
4
- @dataclass
5
- class Chunk:
6
- id : str
7
- content: str
graphgen/models/text/text_pair.py DELETED
@@ -1,9 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- @dataclass
4
- class TextPair:
5
- """
6
- A pair of input data.
7
- """
8
- question: str
9
- answer: str
graphgen/operators/kg/extract_kg.py CHANGED
@@ -7,7 +7,8 @@ import gradio as gr
7
  from tqdm.asyncio import tqdm as tqdm_async
8
 
9
  from graphgen.bases.base_storage import BaseGraphStorage
10
- from graphgen.models import Chunk, OpenAIModel, Tokenizer
 
11
  from graphgen.operators.kg.merge_kg import merge_edges, merge_nodes
12
  from graphgen.templates import KG_EXTRACTION_PROMPT
13
  from graphgen.utils import (
 
7
  from tqdm.asyncio import tqdm as tqdm_async
8
 
9
  from graphgen.bases.base_storage import BaseGraphStorage
10
+ from graphgen.bases.datatypes import Chunk
11
+ from graphgen.models import OpenAIModel, Tokenizer
12
  from graphgen.operators.kg.merge_kg import merge_edges, merge_nodes
13
  from graphgen.templates import KG_EXTRACTION_PROMPT
14
  from graphgen.utils import (
graphgen/operators/preprocess/resolute_coreference.py CHANGED
@@ -1,6 +1,7 @@
1
  from typing import List
2
 
3
- from graphgen.models import Chunk, OpenAIModel
 
4
  from graphgen.templates import COREFERENCE_RESOLUTION_PROMPT
5
  from graphgen.utils import detect_main_language
6
 
 
1
  from typing import List
2
 
3
+ from graphgen.bases.datatypes import Chunk
4
+ from graphgen.models import OpenAIModel
5
  from graphgen.templates import COREFERENCE_RESOLUTION_PROMPT
6
  from graphgen.utils import detect_main_language
7
 
webui/app.py CHANGED
@@ -12,7 +12,7 @@ from graphgen.graphgen import GraphGen
12
  from graphgen.models import OpenAIModel, Tokenizer
13
  from graphgen.models.llm.limitter import RPM, TPM
14
  from graphgen.utils import set_logger
15
- from webui.base import GraphGenParams
16
  from webui.cache_utils import cleanup_workspace, setup_workspace
17
  from webui.count_tokens import count_tokens
18
  from webui.i18n import Translate
@@ -66,13 +66,19 @@ def init_graph_gen(config: dict, env: dict) -> GraphGen:
66
 
67
 
68
  # pylint: disable=too-many-statements
69
- def run_graphgen(params, progress=gr.Progress()):
70
  def sum_tokens(client):
71
  return sum(u["total_tokens"] for u in client.token_usage)
72
 
73
  config = {
74
  "if_trainee_model": params.if_trainee_model,
75
- "input_file": params.input_file,
 
  "output_data_type": params.output_data_type,
77
  "output_data_format": params.output_data_format,
78
  "tokenizer": params.tokenizer,
@@ -91,7 +97,6 @@ def run_graphgen(params, progress=gr.Progress()):
91
  "isolated_node_strategy": params.isolated_node_strategy,
92
  "loss_strategy": params.loss_strategy,
93
  },
94
- "chunk_size": params.chunk_size,
95
  }
96
 
97
  env = {
@@ -284,10 +289,18 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
284
  label="Chunk Size",
285
  minimum=256,
286
  maximum=4096,
287
- value=512,
288
  step=256,
289
  interactive=True,
290
  )
 
  tokenizer = gr.Textbox(
292
  label="Tokenizer", value="cl100k_base", interactive=True
293
  )
@@ -499,7 +512,7 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
499
 
500
  submit_btn.click(
501
  lambda *args: run_graphgen(
502
- GraphGenParams(
503
  if_trainee_model=args[0],
504
  input_file=args[1],
505
  tokenizer=args[2],
@@ -518,12 +531,13 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
518
  trainee_model=args[15],
519
  api_key=args[16],
520
  chunk_size=args[17],
521
- rpm=args[18],
522
- tpm=args[19],
523
- quiz_samples=args[20],
524
- trainee_url=args[21],
525
- trainee_api_key=args[22],
526
- token_counter=args[23],
 
527
  )
528
  ),
529
  inputs=[
@@ -545,6 +559,7 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
545
  trainee_model,
546
  api_key,
547
  chunk_size,
 
548
  rpm,
549
  tpm,
550
  quiz_samples,
 
12
  from graphgen.models import OpenAIModel, Tokenizer
13
  from graphgen.models.llm.limitter import RPM, TPM
14
  from graphgen.utils import set_logger
15
+ from webui.base import WebuiParams
16
  from webui.cache_utils import cleanup_workspace, setup_workspace
17
  from webui.count_tokens import count_tokens
18
  from webui.i18n import Translate
 
66
 
67
 
68
  # pylint: disable=too-many-statements
69
+ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
70
  def sum_tokens(client):
71
  return sum(u["total_tokens"] for u in client.token_usage)
72
 
73
  config = {
74
  "if_trainee_model": params.if_trainee_model,
75
+ "read": {
76
+ "input_file": params.input_file,
77
+ },
78
+ "split": {
79
+ "chunk_size": params.chunk_size,
80
+ "chunk_overlap": params.chunk_overlap,
81
+ },
82
  "output_data_type": params.output_data_type,
83
  "output_data_format": params.output_data_format,
84
  "tokenizer": params.tokenizer,
 
97
  "isolated_node_strategy": params.isolated_node_strategy,
98
  "loss_strategy": params.loss_strategy,
99
  },
 
100
  }
101
 
102
  env = {
 
289
  label="Chunk Size",
290
  minimum=256,
291
  maximum=4096,
292
+ value=1024,
293
  step=256,
294
  interactive=True,
295
  )
296
+ chunk_overlap = gr.Slider(
297
+ label="Chunk Overlap",
298
+ minimum=0,
299
+ maximum=500,
300
+ value=100,
301
+ step=100,
302
+ interactive=True,
303
+ )
304
  tokenizer = gr.Textbox(
305
  label="Tokenizer", value="cl100k_base", interactive=True
306
  )
 
512
 
513
  submit_btn.click(
514
  lambda *args: run_graphgen(
515
+ WebuiParams(
516
  if_trainee_model=args[0],
517
  input_file=args[1],
518
  tokenizer=args[2],
 
531
  trainee_model=args[15],
532
  api_key=args[16],
533
  chunk_size=args[17],
534
+ chunk_overlap=args[18],
535
+ rpm=args[19],
536
+ tpm=args[20],
537
+ quiz_samples=args[21],
538
+ trainee_url=args[22],
539
+ trainee_api_key=args[23],
540
+ token_counter=args[24],
541
  )
542
  ),
543
  inputs=[
 
559
  trainee_model,
560
  api_key,
561
  chunk_size,
562
+ chunk_overlap,
563
  rpm,
564
  tpm,
565
  quiz_samples,
webui/base.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any
3
 
4
 
5
  @dataclass
6
- class GraphGenParams:
7
  """
8
  GraphGen parameters
9
  """
@@ -26,6 +26,7 @@ class GraphGenParams:
26
  trainee_model: str
27
  api_key: str
28
  chunk_size: int
 
29
  rpm: int
30
  tpm: int
31
  quiz_samples: int
 
3
 
4
 
5
  @dataclass
6
+ class WebuiParams:
7
  """
8
  GraphGen parameters
9
  """
 
26
  trainee_model: str
27
  api_key: str
28
  chunk_size: int
29
+ chunk_overlap: int
30
  rpm: int
31
  tpm: int
32
  quiz_samples: int