shamim237 committed on
Commit
ffcb423
1 Parent(s): 774e2fe

Upload 6 files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. app.py +17 -0
  3. chromedriver.exe +3 -0
  4. paraphrase.py +45 -0
  5. requirements.txt +4 -0
  6. scrap.py +17 -0
  7. summary.py +13 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ chromedriver.exe filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from scrap import extract
3
+ from paraphrase import para
4
+ from summary import summarize
5
+
6
+
7
st.title("Let's Summarize!")
link = st.text_input("Enter a product link from amazon....")


def process():
    """Scrape, paraphrase and summarize the product page named by ``link``.

    Reads the module-level ``link`` text-input value, passes it through
    ``extract`` -> ``para`` -> ``summarize`` and renders the result with
    ``st.success``. When the input is blank a warning is shown and the
    pipeline is not started.
    """
    url = link.strip()
    if not url:
        st.warning("Please enter a product link first.")
        return

    # The pipeline drives a browser and two seq2seq models, so give the user
    # visible feedback while it runs.
    with st.spinner("Extracting and summarizing..."):
        data = extract(url)
        paras = para(data)
        summ = summarize(paras)

    # summarize() returns the list produced by batch_decode; show it as text
    # rather than as a Python list.
    st.success(summ if isinstance(summ, str) else " ".join(summ))


# The button's return value is checked instead of registering `on_click=`:
# per the Streamlit docs a callback runs before the script body on the next
# rerun, so anything it writes is not placed under the label below.
run_requested = st.button('Extract')
st.text("Here is the product description...")
if run_requested:
    process()
chromedriver.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93af100505b192263d8dba3b9d735e8ba803ce58c45f0b1bee9efe53a3ec831b
3
+ size 12358144
paraphrase.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
+
5
+
6
# Checkpoint used for paraphrasing.
_PARAPHRASER_NAME = "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"

# (model, tokenizer, device) keyed by checkpoint name, filled on first use so
# repeated calls do not reload the weights.
_PARAPHRASER_CACHE = {}


def _load_paraphraser():
    """Return the cached ``(model, tokenizer, device)`` triple, loading it once."""
    if _PARAPHRASER_NAME not in _PARAPHRASER_CACHE:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = AutoModelForSeq2SeqLM.from_pretrained(_PARAPHRASER_NAME).to(device)
        model.eval()
        tokenizer = AutoTokenizer.from_pretrained(_PARAPHRASER_NAME)
        _PARAPHRASER_CACHE[_PARAPHRASER_NAME] = (model, tokenizer, device)
    return _PARAPHRASER_CACHE[_PARAPHRASER_NAME]


def para(paragraph):
    """Paraphrase each usable line of ``paragraph``.

    Args:
        paragraph: Iterable of strings (one line of text each).

    Returns:
        A list with one paraphrased string per kept line, in input order.
        Lines containing no words or exactly two words are dropped; an
        input with no kept lines yields ``[]`` without loading the model.
    """
    sentences = []
    for line in paragraph:
        word_count = len(re.findall(r'\w+', line))
        # Two-word lines were already skipped by the original filter
        # (presumably short headings -- confirm); lines with no words are
        # skipped too, since there is nothing in them to paraphrase.
        if word_count in (0, 2):
            continue
        sentences.append(line.replace('"', "'").replace("\n", ""))

    if not sentences:
        return []

    model, tokenizer, device = _load_paraphraser()

    paraphrased = []
    for sentence in sentences:
        text = "paraphrase: " + sentence + " </s>"

        # truncation=True is required for max_length to actually cap the input.
        encoding = tokenizer.encode_plus(
            text,
            max_length=1024,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=1024,
            early_stopping=True,
            num_beams=15,
            num_return_sequences=3,
        )

        # Index 2 is the last of the three returned sequences, as in the
        # original code.
        sent = tokenizer.decode(
            beam_outputs[2],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        paraphrased.append(sent.replace("paraphrasedoutput: ", ""))
    return paraphrased
44
+
45
+
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ selenium==4.8.0
2
+ sentencepiece==0.1.97
3
+ torch==1.13.1
4
+ transformers==4.25.1
scrap.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from selenium.webdriver import Chrome
3
+ from selenium.webdriver.common.by import By
4
+
5
def extract(link):
    """Return the text of the ``aplus_feature_div`` element of a page, by line.

    Opens ``link`` in a Chrome session driven by ``./chromedriver.exe``,
    reads the text of the element whose id is ``aplus_feature_div`` and
    splits it on newlines. The browser is always closed before returning.

    Args:
        link: URL of the page to load.

    Returns:
        list[str]: The element's text split on ``"\\n"``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the element
            does not appear within the implicit wait.
    """
    # Local import: Service is not among the module-level imports.
    from selenium.webdriver.chrome.service import Service

    driver_path = "./chromedriver.exe"
    # Selenium 4 takes the driver path through a Service object; the bare
    # `executable_path=` keyword on Chrome is deprecated.
    browser = Chrome(service=Service(executable_path=driver_path))
    try:
        # Let find_element poll while the page renders, instead of sleeping
        # after the element has already been read.
        browser.implicitly_wait(10)
        browser.get(link)
        section = browser.find_element(By.ID, "aplus_feature_div")
        return section.text.split("\n")
    finally:
        # Always shut the browser down so no Chrome/driver process is leaked.
        browser.quit()
15
+
16
if __name__ == "__main__":
    # Manual smoke test. Guarded so that importing this module (for its
    # `extract` function) does not start a browser and scrape this URL.
    ss = extract("https://www.amazon.com/dp/B09B9TB61G?th=1")
    print(ss)
summary.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import PegasusForConditionalGeneration, AutoTokenizer
3
+
4
# (model, tokenizer, device) keyed by checkpoint name, filled on first use so
# repeated calls do not reload the weights.
_SUMMARIZER_CACHE = {}


def _load_summarizer(model_name):
    """Return the cached ``(model, tokenizer, device)`` for ``model_name``."""
    if model_name not in _SUMMARIZER_CACHE:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        _SUMMARIZER_CACHE[model_name] = (model, tokenizer, device)
    return _SUMMARIZER_CACHE[model_name]


def summarize(passage):
    """Summarize ``passage`` with the ``google/pegasus-cnn_dailymail`` model.

    Args:
        passage: Iterable of strings; they are joined with single spaces
            into one input text.

    Returns:
        The list of decoded summaries produced by ``batch_decode`` for the
        joined text, or ``[]`` when the joined text is empty or whitespace
        only (the model is not loaded in that case).
    """
    txt = " ".join(passage)
    if not txt.strip():
        # Nothing to summarize; skip loading and running the model.
        return []

    model_name = 'google/pegasus-cnn_dailymail'
    model, tokenizer, device = _load_summarizer(model_name)
    batch = tokenizer(txt, truncation=True, padding='longest', return_tensors="pt").to(device)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)