limitedonly41 commited on
Commit
b288f4d
1 Parent(s): d598db2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -0
app.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import asyncio
import re
import urllib.request  # bugfix: bare "import urllib" does not make urllib.request available

import gradio as gr
import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from tqdm import tqdm
from unsloth import FastLanguageModel
# Load the model
max_seq_length = 2048  # maximum context window passed to the tokenizer/model
dtype = None  # None lets unsloth auto-detect the compute dtype (fp16/bf16)
load_in_4bit = True  # 4-bit quantisation to reduce GPU memory footprint

# NOTE(review): this runs at import time and requires a CUDA GPU plus the
# unsloth weights download — the app cannot start without them.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)
# Define helper functions
async def fetch_data(url):
    """Fetch *url* and extract its SEO-relevant text fields.

    The blocking urllib request runs in the default executor so the
    event loop stays responsive.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        dict with keys 'url', 'title', 'description', 'keywords', 'h1',
        'h2', 'h3', 'paragraphs' and 'text' (the concatenated content,
        truncated to 4999 chars). On any fetch/parse failure every field
        except 'url' is None.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Referer': url,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    encoding = 'utf-8'
    timeout = 10

    try:
        def get_content():
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=timeout) as response:
                return response.read()

        # get_running_loop() is the modern, non-deprecated call inside a coroutine.
        response_content = await asyncio.get_running_loop().run_in_executor(None, get_content)

        soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

        # BUGFIX: pages without a <title> made the old unconditional
        # soup.find('title').text raise AttributeError, so a perfectly
        # parseable page was reported as a total fetch failure.
        title_tag = soup.find('title')
        title = title_tag.text if title_tag else ""

        def _meta_content(name):
            # Content attribute of <meta name=...>, or "" when absent.
            tag = soup.find('meta', attrs={'name': name})
            if tag and "content" in tag.attrs:
                return tag.get("content")
            return ""

        description = _meta_content('description')
        keywords = _meta_content('keywords')

        h1_all = " ".join(h.text for h in soup.find_all('h1'))
        h2_all = " ".join(h.text for h in soup.find_all('h2'))
        h3_all = " ".join(h.text for h in soup.find_all('h3'))
        paragraphs_all = " ".join(p.text for p in soup.find_all('p'))

        allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
        allthecontent = allthecontent[:4999]  # keep downstream prompt within model limits

        return {
            'url': url,
            'title': title,
            'description': description,
            'keywords': keywords,
            'h1': h1_all,
            'h2': h2_all,
            'h3': h3_all,
            'paragraphs': paragraphs_all,
            'text': allthecontent
        }
    except Exception as e:
        # Best-effort scrape: report the error instead of swallowing it
        # silently, then return an all-None row so callers keep working.
        print(f"Error fetching {url}: {e}")
        return {key: None for key in
                ('title', 'description', 'keywords', 'h1', 'h2', 'h3',
                 'paragraphs', 'text')} | {'url': url}
def concatenate_text(data):
    """Join a fetched page's metadata fields into one whitespace-clean string.

    Args:
        data: mapping as returned by fetch_data (url/title/description/
            keywords/h1/h2/h3 keys are read; falsy fields are skipped).

    Returns:
        Single-space-separated concatenation of the truthy fields.
    """
    fields = ('url', 'title', 'description', 'keywords', 'h1', 'h2', 'h3')
    text = ' '.join(str(data[col]) for col in fields if data[col])
    # BUGFIX: the original used the raw string r'\xa0', which replaces the
    # literal four characters backslash-x-a-0, not the intended
    # non-breaking space U+00A0.
    text = text.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
    text = re.sub(r'\s{2,}', ' ', text)
    return text
def translate_text(text):
    """Translate *text* to English with Google Translate.

    The input is clipped to 4990 characters to stay under the translate
    request limit. Returns the translated string, or None when the
    translation (or the slicing of a non-string input) fails.
    """
    try:
        # Slicing stays inside the try so a None/invalid input is also
        # reported and mapped to None rather than raising.
        clipped = text[:4990]
        return GoogleTranslator(source='auto', target='en').translate(clipped)
    except Exception as e:
        print(f"An error occurred during translation: {e}")
        return None
def summarize_url(url: str) -> str:
    """Scrape *url*, translate its text to English, and ask the LLM for a
    one-word topic describing the site.

    Pipeline: fetch_data -> concatenate_text -> translate_text -> Mistral
    generation with an Alpaca-style prompt.

    Args:
        url: The website URL entered by the user.

    Returns:
        The model's answer (the text after "### Response:" in the output).

    Raises:
        IndexError: if the decoded output unexpectedly lacks the
            "### Response:" marker (not observed with this prompt format).
    """
    # Run the async scraper to completion from this sync Gradio handler.
    result = asyncio.run(fetch_data(url))
    text = concatenate_text(result)
    # NOTE(review): translate_text may return None on failure; it is then
    # formatted into the prompt as the string "None" — confirm acceptable.
    translated_text = translate_text(text)

    # Alpaca instruction template; the model's answer follows "### Response:".
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Describe the website text into one word topic:

### Input:
{}

### Response:
"""

    prompt = alpaca_prompt.format(translated_text)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # use_cache=True enables the KV cache for faster autoregressive decoding.
    outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Everything after the response marker is the model's one-word topic.
    final_answer = summary.split("### Response:")[1].strip()
    return final_answer
# UI copy for the Gradio front end.
APP_TITLE = "Website Summary Generator"
APP_DESCRIPTION = "Enter a URL to get a one-word topic summary of the website content."

# Wire the summarizer into a simple text-in / text-out Gradio interface.
iface = gr.Interface(
    fn=summarize_url,
    inputs="text",
    outputs="text",
    title=APP_TITLE,
    description=APP_DESCRIPTION,
)

# Start the web app (blocks until the server stops).
iface.launch()