Doron Adler committed on
Commit
f4b7889
·
1 Parent(s): d695880

מחולל קטעים מהרצאות טד

Browse files
.gitattributes CHANGED
@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
  *.zip filter=lfs diff=lfs merge=lfs -text
30
  *.zst filter=lfs diff=lfs merge=lfs -text
31
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
29
  *.zip filter=lfs diff=lfs merge=lfs -text
30
  *.zst filter=lfs diff=lfs merge=lfs -text
31
  *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ model/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: SillyHebrewTedTalkSnippetGenerator
3
- emoji: 💻
4
- colorFrom: pink
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.1.7
8
  app_file: app.py
 
1
  ---
2
+ title: מחולל קטעים מהרצאות טד
3
+ emoji: 🧑‍🏫
4
+ colorFrom: green
5
+ colorTo: pink
6
  sdk: gradio
7
  sdk_version: 3.1.7
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+
# --- Texts shown on the Gradio page -----------------------------------------
title = "מחולל קטעים מהרצאות טד"
description = ""
# HTML footer crediting the base model and the dataset used for fine-tuning.
article = (
    '<p>Fine tuned <a href="https://huggingface.co/Norod78/hebrew-gpt_neo-small">'
    'Norod78/hebrew-gpt_neo-small</a> upon a formatted '
    '<a href="https://www.kaggle.com/datasets/miguelcorraljr/ted-ultimate-dataset">'
    ' TED – Ultimate Dataset</a> (Hebrew)</p>'
)

# --- Model -------------------------------------------------------------------
# Both the weights and the tokenizer are loaded from the local ./model folder.
model_id = "./model"
text_generator = pipeline("text-generation", model=model_id, tokenizer=model_id)

# --- Sampling settings read by text_generation() ----------------------------
max_length = 128
top_k = 40
top_p = 0.92
temperature = 1.0
14
+
def _build_prompt(input_text):
    """Return the prompt string that is handed to the text generator."""
    if not input_text:
        # No user text: seed generation with a tab followed by an opening
        # double quote.
        return "\t\""
    if input_text.startswith("<|startoftext|>"):
        return input_text
    return "<|startoftext|>" + input_text


def _clean_generated_text(raw_text):
    """Strip marker tokens and tidy whitespace in the generated text."""
    # Applied strictly in this order; later rules see the result of earlier ones.
    replacements = (
        ("<|startoftext|>", ""),
        ("\r", ""),
        ("\n\n", "\n"),
        ("\t", " "),
        ("<|pad|>", " * "),
        ("\"\"", "\""),
    )
    for old, new in replacements:
        raw_text = raw_text.replace(old, new)
    return raw_text


def text_generation(input_text=None):
    """Generate a text snippet that continues ``input_text``.

    Args:
        input_text: Optional opening text. When it is ``None`` or empty a
            default seed (tab + double quote) is used; otherwise the text is
            prefixed with ``<|startoftext|>`` unless it already starts with it.

    Returns:
        The single generated sequence with marker tokens removed and
        whitespace normalised.
    """
    prompt = _build_prompt(input_text)
    # Only real generation parameters are forwarded. The special-token strings
    # (bos/eos/pad/unknown) are not generate() arguments; the tokenizer loaded
    # by the pipeline already carries them, and recent transformers releases
    # reject unknown keyword arguments.
    outputs = text_generator(
        prompt,
        max_length=max_length,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        do_sample=True,
        repetition_penalty=2.0,
        num_return_sequences=1,
    )
    return _clean_generated_text(outputs[0]["generated_text"])
# Build the single-page UI around text_generation() and start serving it.
gr.Interface(
    text_generation,
    inputs=gr.Textbox(
        lines=1,
        label="הזינו פה ציטוט פתיחה, או השאירו ריק. מה שבא לכם",
        elem_id="input_text",
    ),
    outputs=gr.Textbox(
        type="auto",
        label="פה מופיע הטקסט שהמחולל יוצר",
        elem_id="output_text",
    ),
    # Hebrew is written right-to-left, so both text boxes are rendered RTL.
    css="#output_text{direction: rtl} #input_text{direction: rtl}",
    title=title,
    description=description,
    article=article,
    theme="default",
    # String form of the former boolean ``False``: flagging stays disabled
    # without triggering the boolean-deprecation warning.
    allow_flagging="never",
).launch()
model/added_tokens.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 50257,
3
+ "<|pad|>": 50260,
4
+ "<|startoftext|>": 50258,
5
+ "<|unknown|>": 50259
6
+ }
model/config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Norod78/hebrew-gpt_neo-small",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPTNeoForCausalLM"
6
+ ],
7
+ "attention_dropout": 0,
8
+ "attention_layers": [
9
+ "global",
10
+ "global",
11
+ "global",
12
+ "global",
13
+ "global",
14
+ "global",
15
+ "global",
16
+ "global",
17
+ "global",
18
+ "global",
19
+ "global",
20
+ "global"
21
+ ],
22
+ "attention_types": [
23
+ [
24
+ [
25
+ "global"
26
+ ],
27
+ 12
28
+ ]
29
+ ],
30
+ "bos_token_id": 50256,
31
+ "embed_dropout": 0,
32
+ "eos_token_id": 50256,
33
+ "gradient_checkpointing": false,
34
+ "hidden_size": 768,
35
+ "initializer_range": 0.02,
36
+ "intermediate_size": null,
37
+ "layer_norm_epsilon": 1e-05,
38
+ "max_position_embeddings": 2048,
39
+ "model_type": "gpt_neo",
40
+ "num_heads": 12,
41
+ "num_layers": 12,
42
+ "pad_token_id": 50256,
43
+ "resid_dropout": 0,
44
+ "summary_activation": null,
45
+ "summary_first_dropout": 0.1,
46
+ "summary_proj_to_labels": true,
47
+ "summary_type": "cls_index",
48
+ "summary_use_proj": true,
49
+ "torch_dtype": "float32",
50
+ "transformers_version": "4.20.1",
51
+ "use_cache": true,
52
+ "vocab_size": 50261,
53
+ "window_size": 256
54
+ }
model/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d44ea32f47ad8ed6713087424ff7f165616f6c2c7f5b35e3b1a872a867ed6a6
3
+ size 551197393
model/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|startoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|pad|>",
5
+ "unk_token": "<unk>"
6
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "bos_token": "<|startoftext|>",
5
+ "do_lower_case": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "full_tokenizer_file": null,
9
+ "max_len": 1024,
10
+ "name_or_path": "Norod78/hebrew-gpt_neo-small",
11
+ "pad_token": "<|pad|>",
12
+ "special_tokens_map_file": "special_tokens_map.json",
13
+ "tokenizer_class": "GPT2Tokenizer",
14
+ "unk_token": {
15
+ "__type": "AddedToken",
16
+ "content": "<|endoftext|>",
17
+ "lstrip": false,
18
+ "normalized": true,
19
+ "rstrip": false,
20
+ "single_word": false
21
+ },
22
+ "unknown_token": "<|unknown|>"
23
+ }
model/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ torch
3
+ transformers
4
+ tokenizers
5
+