claytonsamples committed on
Commit
08a92ff
·
1 Parent(s): 1413056

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -4
app.py CHANGED
@@ -34,6 +34,24 @@ def finder(url, soup, media_type):
34
  files.append(file_url)
35
  return files
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def scrapper(url):
38
  try:
39
  response = requests.get(url, timeout=10)
@@ -55,10 +73,7 @@ def scrapper(url):
55
  full_text += line + ' '
56
 
57
  # Initialize the summarization pipeline
58
- summarizer = pipeline('summarization')
59
-
60
- # Summarize the content
61
- summary = summarizer(full_text, max_length=200, min_length=50, do_sample=False)
62
 
63
  # Extract the summary text
64
  summary_text = summary[0]['summary_text']
 
34
  files.append(file_url)
35
  return files
36
 
37
def summarize_long_text(text, chunk_size=1024):
    """Summarize arbitrarily long text by summarizing fixed-size word chunks.

    Parameters
    ----------
    text : str
        The full text to summarize.
    chunk_size : int, optional
        Number of whitespace-separated words per chunk (default 1024).

    Returns
    -------
    str
        The per-chunk summaries joined by single spaces; '' for
        empty or whitespace-only input.
    """
    # Tokenize on whitespace; an empty input yields no chunks, so bail out
    # early and skip the (expensive) pipeline construction entirely.
    words = text.split()
    if not words:
        return ''

    # NOTE(review): building the pipeline on every call re-loads the model;
    # consider caching it at module level if this is called repeatedly.
    summarizer = pipeline('summarization')

    # chunk_size counts *words*, not model tokens — 1024 words usually
    # exceeds a 1024-token model limit, so truncation=True below keeps
    # over-long chunks from raising inside the model.
    chunks = [' '.join(words[i:i + chunk_size])
              for i in range(0, len(words), chunk_size)]

    summarized_chunks = []
    for chunk in chunks:
        # Cap max_length at the chunk's own word count: asking the model for
        # a summary longer than its input triggers warnings and degenerate
        # output.  Keep the original 1024 ceiling and 50 floor.
        n_words = len(chunk.split())
        max_len = max(50, min(1024, n_words))
        result = summarizer(
            chunk,
            max_length=max_len,
            min_length=min(50, max_len),
            do_sample=False,
            truncation=True,
        )
        summarized_chunks.append(result[0]['summary_text'])

    # Combine the summarized chunks into the final summary.
    return ' '.join(summarized_chunks)
54
+
55
  def scrapper(url):
56
  try:
57
  response = requests.get(url, timeout=10)
 
73
  full_text += line + ' '
74
 
75
  # Initialize the summarization pipeline
76
+ summary = summarize_long_text(full_text)
 
 
 
77
 
78
  # Extract the summary text
79
  summary_text = summary[0]['summary_text']