remzicam committed on
Commit
c27e127
·
1 Parent(s): 5d8a17a

Upload 2 files

Browse files

Updated the summarizer model because the previous one raised an error.

Files changed (2) hide show
  1. app.py +55 -17
  2. requirements.txt +3 -0
app.py CHANGED
@@ -2,11 +2,23 @@
2
 
3
  from re import sub
4
 
5
- from gradio import Interface, Series, Textbox
6
  from requests import get
 
7
 
 
8
 
9
- def clean_text(text):
 
 
 
 
 
 
 
 
 
 
10
  """Cleans subtitle text of ted talks.
11
 
12
  Args:
@@ -26,14 +38,14 @@ def clean_text(text):
26
  return cleaned_text
27
 
28
 
29
- def ted_talk_transcriber(link):
30
  """Creates transcription of ted talks from url.
31
 
32
  Args:
33
  link (str): url link of ted talks
34
 
35
  Returns:
36
- cleaned_transcript (str): transcription of the ted talk
37
  """
38
  # request link of the talk
39
  page = get(link)
@@ -42,29 +54,55 @@ def ted_talk_transcriber(link):
42
  raw_text = get(
43
  f"https://hls.ted.com/project_masters/{talk_id}/subtitles/en/full.vtt"
44
  ).text
45
- cleaned_transcript = clean_text(raw_text)
46
- return cleaned_transcript
47
 
48
 
49
- transcriber = Interface(
50
- ted_talk_transcriber,
51
- "text",
52
- "text",
53
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- summarizer = Interface.load(
56
- "huggingface/Shobhank-iiitdwd/long-t5-tglobal-base-16384-book-summary"
57
- )
58
 
59
  logo = "<center><img src='file/TED.png' width=180px></center>"
60
 
61
- Series(
62
- transcriber,
63
- summarizer,
64
  inputs=Textbox(label="Type the TED Talks link"),
65
  examples=[
66
  "https://www.ted.com/talks/jen_gunter_the_truth_about_yeast_in_your_body"
67
  ],
 
68
  allow_flagging="never",
69
  description=logo,
70
  ).launch()
 
2
 
3
  from re import sub
4
 
5
+ from gradio import Interface, Textbox
6
  from requests import get
7
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
8
 
9
repo_id = "pszemraj/led-base-book-summary"

# Tokenizer and model weights are both loaded from the same hub repository.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, low_cpu_mem_usage=True)

# Module-level summarization pipeline, built once and called by text_summarizer.
summarizer = pipeline(task="summarization", model=model, tokenizer=tokenizer)
19
+
20
+
21
+ def clean_text(text: str) -> str:
22
  """Cleans subtitle text of ted talks.
23
 
24
  Args:
 
38
  return cleaned_text
39
 
40
 
41
+ def ted_talk_transcriber(link: str) -> str:
42
  """Creates transcription of ted talks from url.
43
 
44
  Args:
45
  link (str): url link of ted talks
46
 
47
  Returns:
48
+ raw_text (str): raw transcription of the ted talk
49
  """
50
  # request link of the talk
51
  page = get(link)
 
54
  raw_text = get(
55
  f"https://hls.ted.com/project_masters/{talk_id}/subtitles/en/full.vtt"
56
  ).text
57
+ return raw_text
 
58
 
59
 
60
def text_summarizer(text: str) -> str:
    """Summarizes given text.

    Runs the module-level ``summarizer`` pipeline with beam search
    (``num_beams=4``, no sampling) and repetition controls, then returns the
    summary produced for the single input.

    Args:
        text (str): ted talks transcription

    Returns:
        str: summary
    """
    result = summarizer(
        text,
        # Clip over-long transcripts to the model's maximum input length
        # instead of letting the call fail inside the model.
        truncation=True,
        min_length=8,
        max_length=256,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        do_sample=False,
        early_stopping=True,
    )
    # A single string was passed in, so the first entry holds its summary.
    return result[0]["summary_text"]
81
+
82
+
83
def main(link: str) -> str:
    """Produce a summary of the TED talk found at the given link.

    The transcript is fetched with ``ted_talk_transcriber``, passed through
    ``clean_text`` and finally handed to ``text_summarizer``.

    Args:
        link (str): url link of ted talks

    Returns:
        str: summary
    """
    return text_summarizer(clean_text(ted_talk_transcriber(link)))
95
 
 
 
 
96
 
97
  logo = "<center><img src='file/TED.png' width=180px></center>"
98
 
99
# Single-page UI: a TED talk URL goes in, the generated summary comes out.
Interface(
    fn=main,
    inputs=Textbox(label="Type the TED Talks link"),
    outputs=Textbox(label="Summary"),
    examples=[
        "https://www.ted.com/talks/jen_gunter_the_truth_about_yeast_in_your_body"
    ],
    description=logo,
    allow_flagging="never",
).launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ accelerate
2
+ torch
3
+ transformers