UNIST-Eunchan committed
Commit 944d9b4 • 1 Parent(s): e59f863

Update app.py

Files changed (1): app.py (+77 -32)
app.py CHANGED

@@ -32,7 +32,45 @@ def infer(input_ids, max_length, temperature, top_k, top_p):
 
     return output_sequences
 
-default_value = test_book[0]['book']
+
+def chunking(book_text):
+    # NOTE: relies on the module-level sentences and token_lens lists computed
+    # elsewhere in app.py; the book_text argument is not used inside the function yet.
+    segments = []
+    current_segment = ""
+    total_token_lens = 0
+    for i in range(len(sentences)):
+
+        if total_token_lens < 512:
+            total_token_lens += token_lens[i]
+            current_segment += (sentences[i] + " ")
+
+        elif total_token_lens > 768:
+            segments.append(current_segment)
+            current_segment = sentences[i]
+            total_token_lens = token_lens[i]
+
+        else:
+            # make next_pseudo_segment: up to 30 upcoming sentences, capped at 512 tokens
+            next_pseudo_segment = ""
+            next_token_len = 0
+            for t in range(30):
+                if (i + t < len(sentences)) and (next_token_len + token_lens[i + t] < 512):
+                    next_token_len += token_lens[i + t]
+                    next_pseudo_segment += (sentences[i + t] + " ")
+
+            embs = model.encode([current_segment, next_pseudo_segment, sentences[i]])  # current, next, sent
+            if cos_similarity(embs[1], embs[2]) > cos_similarity(embs[0], embs[2]):
+                # sentence i reads closer to the upcoming text: close the current segment
+                segments.append(current_segment)
+                current_segment = sentences[i]
+                total_token_lens = token_lens[i]
+            else:
+                total_token_lens += token_lens[i]
+                current_segment += (sentences[i] + " ")
+
+    # flush the trailing partial segment so the end of the book is not dropped
+    if current_segment.strip():
+        segments.append(current_segment)
+    return segments
+
+
+chunked_segments = chunking(test_book[0]['book'])
 
 '''
 '''
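The new `chunking()` depends on four names that never appear in this hunk: `sentences`, `token_lens`, `model` (a sentence-embedding encoder), and `cos_similarity`. A minimal sketch of how they might be prepared, reusing the app's own `tokenizer` and `test_book`; the sentence splitter, embedding checkpoint, and helper definitions below are assumptions, not code from this commit:

```python
# Assumed module-level setup for chunking(); not part of this diff.
import numpy as np
from nltk.tokenize import sent_tokenize                # requires nltk.download("punkt")
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")        # placeholder embedding checkpoint

def cos_similarity(a, b):
    # cosine similarity between two 1-D embedding vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

book_text = test_book[0]['book']
sentences = sent_tokenize(book_text)                   # one entry per sentence
token_lens = [len(tokenizer.encode(s)) for s in sentences]  # token count per sentence
```

Given these, the loop grows a segment until it reaches 512 tokens, force-closes it past 768 tokens, and in the band between lets the embedding comparison decide whether sentence i belongs with the current segment or with the upcoming text.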
 
@@ -41,40 +79,47 @@ default_value = test_book[0]['book']
 st.title("Book Summarization 📚")
 st.write("GPT-2, the almighty king of text generation, comes in four sizes, only three of which have been publicly released. Feared for its fake news generation capabilities, it currently stands as the most syntactically coherent model. A direct successor to the original GPT, it reinforces the already established pre-training/fine-tuning killer duo. From the paper: Language Models are Unsupervised Multitask Learners by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever.")
 
-sent = st.text_area("Text", default_value, height = 550)
+book_index = st.sidebar.slider("Select Book Example", value = 0, min_value = 0, max_value = 4)
+
+_book = test_book[book_index]['book']
+chunked_segments = chunking(_book)
+
+sent = st.text_area("Text", _book, height = 550)
 max_length = st.sidebar.slider("Max Length", value = 512, min_value = 10, max_value = 1024)
 temperature = st.sidebar.slider("Temperature", value = 1.0, min_value = 0.0, max_value = 1.0, step = 0.05)
 top_k = st.sidebar.slider("Top-k", min_value = 0, max_value = 5, value = 0)
 top_p = st.sidebar.slider("Top-p", min_value = 0.0, max_value = 1.0, step = 0.05, value = 0.92)
 
-encoded_prompt = tokenizer.encode(sent, add_special_tokens=False, return_tensors="pt")
-if encoded_prompt.size()[-1] == 0:
-    input_ids = None
-else:
-    input_ids = encoded_prompt
-
-output_sequences = infer(input_ids, max_length, temperature, top_k, top_p)
-
-for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
-    print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
-    generated_sequences = generated_sequence.tolist()
-
-    # Decode text
-    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
-
-    # Remove all text after the stop token
-    #text = text[: text.find(args.stop_token) if args.stop_token else None]
-
-    # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
-    total_sequence = (
-        sent + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
-    )
-
-    generated_sequences.append(total_sequence)
-    print(total_sequence)
-
-st.write(generated_sequences[-1])
+for segment in chunked_segments:
+
+    encoded_prompt = tokenizer.encode(segment, add_special_tokens=False, return_tensors="pt")
+    if encoded_prompt.size()[-1] == 0:
+        input_ids = None
+    else:
+        input_ids = encoded_prompt
+
+    output_sequences = infer(input_ids, max_length, temperature, top_k, top_p)
+
+    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
+        print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
+        generated_sequences = generated_sequence.tolist()
+
+        # Decode text
+        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
+
+        # Remove all text after the stop token
+        #text = text[: text.find(args.stop_token) if args.stop_token else None]
+
+        # Prepend the prompt segment and drop the decoded prompt from the model output
+        total_sequence = (
+            segment + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
+        )
+
+        generated_sequences.append(total_sequence)
+        print(total_sequence)
+
+    st.write(generated_sequences[-1])
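`infer()` is defined above this hunk, so only its `return output_sequences` line is visible here. Judging by its signature it presumably wraps the model's `generate()` call; a sketch under that assumption (the `lm_model` name and the sampling flags are guesses, not code from app.py):

```python
# Plausible shape of infer(); only its return statement appears in the diff.
def infer(input_ids, max_length, temperature, top_k, top_p):
    output_sequences = lm_model.generate(  # lm_model: the fine-tuned LM, loaded elsewhere
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
        num_return_sequences=1,
    )
    return output_sequences
```

With the sidebar defaults (top_k = 0, top_p = 0.92), top-k filtering is disabled and nucleus sampling alone shapes the output distribution.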
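Because `st.write(generated_sequences[-1])` sits inside the segment loop, each chunk's summary is rendered as soon as it is generated. If a single stitched summary is preferred, the per-segment results could be collected first and written once; `summarize_segment` below is a hypothetical wrapper around the encode/infer/decode steps above, not a function in app.py:

```python
# Hypothetical variant: render one combined summary instead of one per chunk.
all_summaries = []
for segment in chunked_segments:
    all_summaries.append(summarize_segment(segment))  # assumed wrapper, not in app.py
st.write(" ".join(all_summaries))
```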