Spaces:

Cat125
/

text-generator-v2

Running

App Files Community

Cat0125 committed on Aug 15, 2023

Commit

94d6d2b

1 Parent(s): 5f3fb7b

Update models & training system

Browse files

Files changed (10) hide show

datamanager.py +4 -1
main.py +3 -35
models/en/data.pkl +2 -2
models/en/data3.pkl +2 -2
models/ru-lg/data.pkl +2 -2
models/ru-lg/data3.pkl +2 -2
models/ru-lg/text.txt +0 -0
models/ru-lite/data.pkl +1 -1
models/ru-lite/data3.pkl +1 -1
train.py +8 -13

datamanager.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import json
 import pickle
-from files import read_lines
 models = json.load(open("models/models.json"))
 TEXT_PATH = 'models/%s/text.txt'
@@ -21,6 +21,9 @@ def get_texts(model_name):
     """
     return read_lines(TEXT_PATH % model_name)
 def set_data(model_name, data):
     """
     This function saves data to a file using the pickle module, with the filename specified by the

 import json
 import pickle
+from files import read_lines, read_file
 models = json.load(open("models/models.json"))
 TEXT_PATH = 'models/%s/text.txt'
     """
     return read_lines(TEXT_PATH % model_name)
+def get_text(model_name):
+    return read_file(TEXT_PATH % model_name)
 def set_data(model_name, data):
     """
     This function saves data to a file using the pickle module, with the filename specified by the

main.py CHANGED Viewed

@@ -13,25 +13,6 @@ WEIGHTS_MAP = [
 ]
 def get_next_word_results(db:dict, message:str, prev_word:str, text:str, repeat:int = 0):
-    """
-    This function takes in a database, a message, a previous word, and an optional repeat count, and
-    returns a list of tokens from the database that match the previous word and have a score based on
-    their context in the message.
-    :param db: a dictionary containing information about words and their contexts
-    :param message: a string representing the message or text being analyzed
-    :type message: str
-    :param prev_word: The previous word that we want to find the next word(s) for
-    :type prev_word: str
-    :param repeat: The repeat parameter is an optional integer parameter that specifies how many times
-    the previous word can be repeated in the message before it is no longer considered a valid context
-    for the next word. If repeat is set to 0, then there is no limit on the number of times the previous
-    word can be repeated, defaults to 0
-    :type repeat: int (optional)
-    :return: a list of Token objects that are the next possible words in a given message based on the
-    previous word and its contexts in a database. If the previous word is not in the database, an empty
-    list is returned.
-    """
     results = []
     if prev_word not in db:
         return []
@@ -94,7 +75,7 @@ def generator(user_message, word_count, mode, model_name):
             yield text
             break
         if i == 0 and text.strip() == '.':
-            raise gr.Error("Error in generating. Try to use another prompt")
         i += 1
         yield text.strip()
@@ -103,11 +84,8 @@ demo = gr.Blocks(
 )
 title_html = """
-<center>
-    <h1>Text Generator v2</h1>
-    <p>Generates text using per-word context system</p>
-    <a href="http://j93153xm.beget.tech/app/index.html?id=text-ai"><img src="https://img.shields.io/badge/Text%20Generator%20v1-RU%20only-brightgreen"></a>
-</center>
 """
 info_text = """
 # Information about the models
@@ -127,16 +105,6 @@ info_text = """
 `Language`: Russian
 `Quality`: 7-8/10
 `Sources`: http://staging.budsvetom.com/literature_items/ochen-dlinnyy-tekst
-# Training
-```bash
-python train.py -r <models to train> [-t] [-l ...]
-```
-`--rebuild` (`-r`) - Models that will be trained.
-`--turbo` (`-t`) - Enables turbo training. Will skip morphological analysis and just add all words directly.
-`--log` (`-l`) - Logs listed databases to the console after training.
-> **Note:** Use `--turbo` only when training with Russian texts.
 """
 with demo:
     gr.HTML(title_html)

 ]
 def get_next_word_results(db:dict, message:str, prev_word:str, text:str, repeat:int = 0):
     results = []
     if prev_word not in db:
         return []
             yield text
             break
         if i == 0 and text.strip() == '.':
+            raise gr.Error("Error while generating. Please try again.")
         i += 1
         yield text.strip()
 )
 title_html = """
+<h1>Text Generator v2</h1>
+<a href="http://j93153xm.beget.tech/app/index.html?id=text-ai"><img src="https://img.shields.io/badge/Text%20Generator%20v1-RU%20only-brightgreen"></a>
 """
 info_text = """
 # Information about the models
 `Language`: Russian
 `Quality`: 7-8/10
 `Sources`: http://staging.budsvetom.com/literature_items/ochen-dlinnyy-tekst
 """
 with demo:
     gr.HTML(title_html)

models/en/data.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0a44379910fb94a41548dc22de9d5c94d31b74a71d50951804c9f4b904311ae9
-size 892450

 version https://git-lfs.github.com/spec/v1
+oid sha256:decf142f192f3ae9576f87a8aa119e39dd852317c8bdba2d83fc4eddebb3cc3b
+size 3717733

models/en/data3.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc6f7a452bcc7aa0e6d9508a38c498c23df71e430bb26e9452e3492e464e786e
-size 926524

 version https://git-lfs.github.com/spec/v1
+oid sha256:707e9ea7c67970a84c25f068aa74c130b45b11d1235d9e59efead924c6efd3a7
+size 3698343

models/ru-lg/data.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a37fc4eab3fb56240c4db1e4e0011f29b4f7c454dd2c723ce38b36ccbc38da25
-size 3436464

 version https://git-lfs.github.com/spec/v1
+oid sha256:fa404e61cfc8a102e228335ed384af25fae86ecead0143acbff67954adde0bb7
+size 3997228

models/ru-lg/data3.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:722b3c087d60a91f35333faeb2bc98ca5d609aebfabcfd7ea57c8561d29fcda2
-size 3449818

 version https://git-lfs.github.com/spec/v1
+oid sha256:f839b88e09df9f6fb3b010cdee4945ead7b8385d252cb0f30529d3aa5229d8eb
+size 4022056

models/ru-lg/text.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/ru-lite/data.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:13461ea05cfe7f067013241ade45a97bfa104ce90ea4f4b3edfaf21e35beda92
 size 560516

 version https://git-lfs.github.com/spec/v1
+oid sha256:63b42b3a11e0216544a7fca69986557e246cfb0cd773c2ca6687932fe9ede410
 size 560516

models/ru-lite/data3.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f38a913d682d96840deab6b5a9468539c4373c5b14ab3eb1ef7009c2c1dc8dc9
 size 573246

 version https://git-lfs.github.com/spec/v1
+oid sha256:a8bb5f68b452a5cacbdbac92b096f3cfd8fe18ca6012951a4f42ee21615dcf17
 size 573246

train.py CHANGED Viewed

@@ -5,20 +5,21 @@ from pprint import pprint
 from tqdm import tqdm
 from classes import Token
-from datamanager import get_data, get_texts, models, set_data, set_data_v3
 turbo = False
 def normalize_text(sentence):
     sentence = sentence.strip()
     sentence = re.sub(r'\s+([.,!?;:])', r'\1', sentence)
     sentence = re.sub(r'([.,!?;:])(\S)', r'\1 \2', sentence)
     sentence = re.sub(r'\s+\'|\'\s+', '\'', sentence)
     sentence = re.sub(r'\s+', ' ', sentence)
     return sentence
-def process_sentence(db, db3, sentence:str, text):
     words = sentence.strip().split()
     for i in range(len(words)):
         word = words[i].strip()
@@ -39,15 +40,9 @@ def train(model_name):
     db = []
     db3 = {}
     print(f'Rebuilding database for "{model_name}"...')
-    k = 0
-    texts = get_texts(model_name)
-    total_texts = len(texts)
-    for text in texts:
-        k += 1
-        print(f'Processing text {k} of {total_texts}...')
-        text = normalize_text(text)
-        process_text(db, db3, text)
     set_data(model_name, db)
     models[model_name]["db"] = db
     set_data_v3(model_name, db3)
@@ -55,8 +50,8 @@ def train(model_name):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
-        prog='Text Generator v2',
-        description='Generates text from a text file')
     parser.add_argument('-r', '--rebuild', action='extend', nargs="+", type=str)
     parser.add_argument('-l', '--log', action='extend', nargs="+", type=str)
     parser.add_argument('-t', '--turbo', action='store_true')

 from tqdm import tqdm
 from classes import Token
+from datamanager import get_data, get_text, models, set_data, set_data_v3
 turbo = False
 def normalize_text(sentence):
     sentence = sentence.strip()
+    sentence = re.sub(r'(\s+|\n+)', ' ', sentence)
     sentence = re.sub(r'\s+([.,!?;:])', r'\1', sentence)
     sentence = re.sub(r'([.,!?;:])(\S)', r'\1 \2', sentence)
     sentence = re.sub(r'\s+\'|\'\s+', '\'', sentence)
     sentence = re.sub(r'\s+', ' ', sentence)
     return sentence
+def process_sentence(db, db3, sentence, text):
     words = sentence.strip().split()
     for i in range(len(words)):
         word = words[i].strip()
     db = []
     db3 = {}
     print(f'Rebuilding database for "{model_name}"...')
+    text = get_text(model_name)
+    text = normalize_text(text)
+    process_text(db, db3, text)
     set_data(model_name, db)
     models[model_name]["db"] = db
     set_data_v3(model_name, db3)
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
+        prog='Train',
+        description='Training system for Text Generator v2')
     parser.add_argument('-r', '--rebuild', action='extend', nargs="+", type=str)
     parser.add_argument('-l', '--log', action='extend', nargs="+", type=str)
     parser.add_argument('-t', '--turbo', action='store_true')