limitedonly41 committed
Commit 2891c11 · verified · 1 Parent(s): c885544

Update app.py

Files changed (1)
  1. app.py +266 -107
app.py CHANGED
@@ -1,22 +1,28 @@
  import gradio as gr
- import asyncio
- import requests
- from bs4 import BeautifulSoup
  import pandas as pd
  from tqdm import tqdm
  import urllib
- from deep_translator import GoogleTranslator
- import spaces
-
- # from unsloth import FastLanguageModel
- import torch
- import re
-
-
- # Define helper functions
- async def fetch_data(url):
      headers = {
          'Accept': '*/*',
          'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
@@ -32,38 +38,33 @@ async def fetch_data(url):
      }

      encoding = 'utf-8'
-     timeout = 10
-
      try:
-         def get_content():
-             req = urllib.request.Request(url, headers=headers)
-             with urllib.request.urlopen(req, timeout=timeout) as response:
-                 return response.read()
-
-         response_content = await asyncio.get_event_loop().run_in_executor(None, get_content)

          soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

          title = soup.find('title').text
          description = soup.find('meta', attrs={'name': 'description'})
-         if description and "content" in description.attrs:
-             description = description.get("content")
-         else:
-             description = ""

          keywords = soup.find('meta', attrs={'name': 'keywords'})
-         if keywords and "content" in keywords.attrs:
-             keywords = keywords.get("content")
-         else:
-             keywords = ""

-         h1_all = " ".join(h.text for h in soup.find_all('h1'))
-         h2_all = " ".join(h.text for h in soup.find_all('h2'))
-         h3_all = " ".join(h.text for h in soup.find_all('h3'))
-         paragraphs_all = " ".join(p.text for p in soup.find_all('p'))

-         allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
-         allthecontent = allthecontent[:4999]

          return {
              'url': url,
@@ -77,6 +78,7 @@ async def fetch_data(url):
              'text': allthecontent
          }
      except Exception as e:
          return {
              'url': url,
              'title': None,
@@ -89,45 +91,25 @@ async def fetch_data(url):
              'text': None
          }

- def concatenate_text(data):
-     text_parts = [str(data[col]) for col in ['url', 'title', 'description', 'keywords', 'h1', 'h2', 'h3'] if data[col]]
-     text = ' '.join(text_parts)
-     text = text.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
-     text = re.sub(r'\s{2,}', ' ', text)
-     return text
-
- def translate_text(text):
-     try:
-         text = text[:4990]
-         translated_text = GoogleTranslator(source='auto', target='en').translate(text)
-         return translated_text
-     except Exception as e:
-         print(f"An error occurred during translation: {e}")
-         return None


- model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
-
- # Initialize model and tokenizer variables
- model = None
- tokenizer = None
-
  @spaces.GPU()
- def summarize_url(url):

      global model, tokenizer  # Declare model and tokenizer as global variables

-     # Load the model
-     max_seq_length = 2048
-     dtype = None
-     load_in_4bit = True
-
      if model is None or tokenizer is None:
-         from unsloth import FastLanguageModel
-
-         # Load the model and tokenizer
          model, tokenizer = FastLanguageModel.from_pretrained(
-             model_name=model_name,  # YOUR MODEL YOU USED FOR TRAINING
              max_seq_length=max_seq_length,
              dtype=dtype,
              load_in_4bit=load_in_4bit,
@@ -135,11 +117,14 @@ def summarize_url(url):
          FastLanguageModel.for_inference(model)  # Enable native 2x faster inference


-     result = asyncio.run(fetch_data(url))
-     text = concatenate_text(result)
-     translated_text = translate_text(text)
-     if len(translated_text) < 100:
-         return 'not scraped or short text'
      alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

  ### Instruction:
@@ -151,7 +136,7 @@ def summarize_url(url):
  ### Response:
  """

-     prompt = alpaca_prompt.format(translated_text)
      inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

      outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
@@ -159,55 +144,229 @@ def summarize_url(url):
      final_answer = summary.split("### Response:")[1].strip()
      return final_answer
-
-
- # # Create the Gradio interface within a `Blocks` context, like the working example
- # with gr.Blocks() as demo:
-
- #     # Add title and description to the interface
- #     gr.HTML("<h1>Website Summary Generator</h1>")
- #     gr.HTML("<p>Enter a URL to get a one-word topic summary of the website content..</p>")
-
- #     # Define input and output elements
- #     with gr.Row():
- #         prompt = gr.Textbox(label="Enter Website URL", placeholder="https://example.com")
- #         output_text = gr.Textbox(label="Topic", interactive=False)
-
- #     # Add the button to trigger the function
- #     submit = gr.Button("Classify")
-
- #     # Define the interaction between inputs and outputs
- #     submit.click(fn=summarize_url, inputs=prompt, outputs=output_text)
-
- # # Add the `if __name__ == "__main__":` block to launch the interface
- # if __name__ == "__main__":
- #     demo.launch()
-
-
- # with gr as demo:
- #     # Define Gradio interface
- #     demo = demo.Interface(
- #         fn=summarize_url,
- #         inputs="text",
- #         outputs="text",
- #         title="Website Summary Generator",
- #         description="Enter a URL to get a one-word topic summary of the website content."
- #     )
-
-
- # if __name__ == "__main__":
- #     demo.launch()
-
-
- # Create a Gradio interface
- iface = gr.Interface(
-     fn=summarize_url,
-     inputs="text",
-     outputs="text",
-     title="Website Summary Generator",
-     description="Enter a URL to get a one-word topic summary of the website content."
- )
-
- # Launch the interface
- iface.launch()
  import gradio as gr
+ import torch
+ import spaces
+ import logging
+ from deep_translator import GoogleTranslator
  import pandas as pd
  from tqdm import tqdm
  import urllib
+ from bs4 import BeautifulSoup
+
+ # Configure logging to write messages to a file
+ logging.basicConfig(filename='app.log', level=logging.ERROR)
+
+ # Configuration
+ max_seq_length = 2048
+ dtype = None  # Auto detection of dtype
+ load_in_4bit = True  # Use 4-bit quantization to reduce memory usage
+
+ # peft_model_name = "limitedonly41/website_qwen2_7b_2"
+ peft_model_name = "limitedonly41/website_mistral7b_v02"
+ # Initialize model and tokenizer variables
+ model = None
+ tokenizer = None

+ def fetch_data(url):
      headers = {
          'Accept': '*/*',
          'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',

      }

      encoding = 'utf-8'
+     timeout = 10  # Set your desired timeout value in seconds
      try:
+         # Make the request using urllib
+         req = urllib.request.Request(url, headers=headers)
+         with urllib.request.urlopen(req, timeout=timeout) as response:
+             response_content = response.read()

          soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

          title = soup.find('title').text
          description = soup.find('meta', attrs={'name': 'description'})
+         description = description.get("content") if description and "content" in description.attrs else ""

          keywords = soup.find('meta', attrs={'name': 'keywords'})
+         keywords = keywords.get("content") if keywords and "content" in keywords.attrs else ""

+         h1_all = ". ".join(h.text for h in soup.find_all('h1'))
+         paragraphs_all = ". ".join(p.text for p in soup.find_all('p'))
+         h2_all = ". ".join(h.text for h in soup.find_all('h2'))
+         h3_all = ". ".join(h.text for h in soup.find_all('h3'))
+
+         allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"[:4999]
+
+         # Clean up the text
+         h1_all = h1_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
+         h2_all = h2_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
+         h3_all = h3_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')

          return {
              'url': url,

              'text': allthecontent
          }
      except Exception as e:
+         print(url, e)
          return {
              'url': url,
              'title': None,

              'text': None
          }

+ def main(urls):
+     results = []
+     for url in tqdm(urls):
+         result = fetch_data(url)
+         results.append(result)
+     return results
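
For orientation, a minimal sketch of exercising the scraping helpers above on their own (an editor's illustration, not part of the committed file; the URLs are hypothetical):

    # Hypothetical usage of fetch_data/main as defined in app.py above.
    pages = main(["https://example.com", "https://www.wikipedia.org"])
    df = pd.DataFrame(pages)  # same DataFrame shape classify_website builds below
    print(df[["url", "title"]])
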
  @spaces.GPU()
+ def classify_website(url):
+     from unsloth import FastLanguageModel  # Import moved to the top for model loading

      global model, tokenizer  # Declare model and tokenizer as global variables

      if model is None or tokenizer is None:
+
+         # Load the model and tokenizer during initialization (in the main process)
          model, tokenizer = FastLanguageModel.from_pretrained(
+             model_name=peft_model_name,
              max_seq_length=max_seq_length,
              dtype=dtype,
              load_in_4bit=load_in_4bit,

          FastLanguageModel.for_inference(model)  # Enable native 2x faster inference


+     urls = [url]
+     results_shop = main(urls)
+
+     # Convert results to DataFrame
+     df_result_train_more = pd.DataFrame(results_shop)
+     text = df_result_train_more['text'][0]
+     translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])
+
      alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

  ### Instruction:

  ### Response:
  """

+     prompt = alpaca_prompt.format(translated)
      inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

      outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)

      final_answer = summary.split("### Response:")[1].strip()
      return final_answer
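
classify_website recovers the topic by splitting the decoded generation on the "### Response:" marker. A self-contained illustration of that step (the decoded string is invented for the example):

    # `decoded` stands in for tokenizer.decode(outputs[0], skip_special_tokens=True).
    decoded = (
        "Below is an instruction that describes a task...\n\n"
        "### Instruction:\nDescribe the website text into one word topic:\n\n"
        "### Input:\nExample Shop. Buy running shoes online.\n\n"
        "### Response:\nEcommerce"
    )
    final_answer = decoded.split("### Response:")[1].strip()
    print(final_answer)  # -> Ecommerce
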

+ # Create a Gradio interface
+ iface = gr.Interface(
+     fn=classify_website,
+     inputs="text",
+     outputs="text",
+     title="Website Topic",
+     description="Enter a URL to get a topic summary of the website content."
+ )
+
+ # Launch the interface
+ iface.launch()
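
The model is loaded lazily into module-level globals, so only the first GPU call pays the load cost and later calls reuse the loaded weights. The same pattern in a stripped-down, runnable sketch (load_model here is a dummy stand-in for FastLanguageModel.from_pretrained):

    model = None
    tokenizer = None

    def load_model():
        # Dummy loader standing in for FastLanguageModel.from_pretrained(...).
        return "model-object", "tokenizer-object"

    def get_model():
        global model, tokenizer
        if model is None or tokenizer is None:  # load once, reuse on later calls
            model, tokenizer = load_model()
        return model, tokenizer
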
+
+ # import gradio as gr
+ # import asyncio
+ # import requests
+ # from bs4 import BeautifulSoup
+ # import pandas as pd
+ # from tqdm import tqdm
+ # import urllib
+ # from deep_translator import GoogleTranslator
+ # import spaces
+
+
+ # # from unsloth import FastLanguageModel
+ # import torch
+ # import re
+
+
+ # # Define helper functions
+ # async def fetch_data(url):
+ #     headers = {
+ #         'Accept': '*/*',
+ #         'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
+ #         'Connection': 'keep-alive',
+ #         'Referer': f'{url}',
+ #         'Sec-Fetch-Dest': 'empty',
+ #         'Sec-Fetch-Mode': 'cors',
+ #         'Sec-Fetch-Site': 'cross-site',
+ #         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
+ #         'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
+ #         'sec-ch-ua-mobile': '?0',
+ #         'sec-ch-ua-platform': '"macOS"',
+ #     }
+
+ #     encoding = 'utf-8'
+ #     timeout = 10
+
+ #     try:
+ #         def get_content():
+ #             req = urllib.request.Request(url, headers=headers)
+ #             with urllib.request.urlopen(req, timeout=timeout) as response:
+ #                 return response.read()
+
+ #         response_content = await asyncio.get_event_loop().run_in_executor(None, get_content)
+
+ #         soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)
+
+ #         title = soup.find('title').text
+ #         description = soup.find('meta', attrs={'name': 'description'})
+ #         if description and "content" in description.attrs:
+ #             description = description.get("content")
+ #         else:
+ #             description = ""
+
+ #         keywords = soup.find('meta', attrs={'name': 'keywords'})
+ #         if keywords and "content" in keywords.attrs:
+ #             keywords = keywords.get("content")
+ #         else:
+ #             keywords = ""
+
+ #         h1_all = " ".join(h.text for h in soup.find_all('h1'))
+ #         h2_all = " ".join(h.text for h in soup.find_all('h2'))
+ #         h3_all = " ".join(h.text for h in soup.find_all('h3'))
+ #         paragraphs_all = " ".join(p.text for p in soup.find_all('p'))
+
+ #         allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
+ #         allthecontent = allthecontent[:4999]
+
+ #         return {
+ #             'url': url,
+ #             'title': title,
+ #             'description': description,
+ #             'keywords': keywords,
+ #             'h1': h1_all,
+ #             'h2': h2_all,
+ #             'h3': h3_all,
+ #             'paragraphs': paragraphs_all,
+ #             'text': allthecontent
+ #         }
+ #     except Exception as e:
+ #         return {
+ #             'url': url,
+ #             'title': None,
+ #             'description': None,
+ #             'keywords': None,
+ #             'h1': None,
+ #             'h2': None,
+ #             'h3': None,
+ #             'paragraphs': None,
+ #             'text': None
+ #         }
+
+ # def concatenate_text(data):
+ #     text_parts = [str(data[col]) for col in ['url', 'title', 'description', 'keywords', 'h1', 'h2', 'h3'] if data[col]]
+ #     text = ' '.join(text_parts)
+ #     text = text.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
+ #     text = re.sub(r'\s{2,}', ' ', text)
+ #     return text
+
+ # def translate_text(text):
+ #     try:
+ #         text = text[:4990]
+ #         translated_text = GoogleTranslator(source='auto', target='en').translate(text)
+ #         return translated_text
+ #     except Exception as e:
+ #         print(f"An error occurred during translation: {e}")
+ #         return None
+
+
+ # model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
+
+ # # Initialize model and tokenizer variables
+ # model = None
+ # tokenizer = None
+
+ # @spaces.GPU()
+ # def summarize_url(url):
+
+ #     global model, tokenizer  # Declare model and tokenizer as global variables
+
+ #     # Load the model
+ #     max_seq_length = 2048
+ #     dtype = None
+ #     load_in_4bit = True
+
+ #     if model is None or tokenizer is None:
+ #         from unsloth import FastLanguageModel
+
+ #         # Load the model and tokenizer
+ #         model, tokenizer = FastLanguageModel.from_pretrained(
+ #             model_name=model_name,  # YOUR MODEL YOU USED FOR TRAINING
+ #             max_seq_length=max_seq_length,
+ #             dtype=dtype,
+ #             load_in_4bit=load_in_4bit,
+ #         )
+ #         FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
+
+
+ #     result = asyncio.run(fetch_data(url))
+ #     text = concatenate_text(result)
+ #     translated_text = translate_text(text)
+ #     if len(translated_text) < 100:
+ #         return 'not scraped or short text'
+ #     alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ # ### Instruction:
+ # Describe the website text into one word topic:
+
+ # ### Input:
+ # {}
+
+ # ### Response:
+ # """
+
+ #     prompt = alpaca_prompt.format(translated_text)
+ #     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+ #     outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
+ #     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ #     final_answer = summary.split("### Response:")[1].strip()
+ #     return final_answer
+
+
+ # # # Create the Gradio interface within a `Blocks` context, like the working example
+ # # with gr.Blocks() as demo:
+
+ # #     # Add title and description to the interface
+ # #     gr.HTML("<h1>Website Summary Generator</h1>")
+ # #     gr.HTML("<p>Enter a URL to get a one-word topic summary of the website content..</p>")
+
+ # #     # Define input and output elements
+ # #     with gr.Row():
+ # #         prompt = gr.Textbox(label="Enter Website URL", placeholder="https://example.com")
+ # #         output_text = gr.Textbox(label="Topic", interactive=False)
+
+ # #     # Add the button to trigger the function
+ # #     submit = gr.Button("Classify")
+
+ # #     # Define the interaction between inputs and outputs
+ # #     submit.click(fn=summarize_url, inputs=prompt, outputs=output_text)
+
+ # # # Add the `if __name__ == "__main__":` block to launch the interface
+ # # if __name__ == "__main__":
+ # #     demo.launch()
+
+
+ # # with gr as demo:
+ # #     # Define Gradio interface
+ # #     demo = demo.Interface(
+ # #         fn=summarize_url,
+ # #         inputs="text",
+ # #         outputs="text",
+ # #         title="Website Summary Generator",
+ # #         description="Enter a URL to get a one-word topic summary of the website content."
+ # #     )
+
+
+ # # if __name__ == "__main__":
+ # #     demo.launch()
+
+
+ # # Create a Gradio interface
+ # iface = gr.Interface(
+ #     fn=summarize_url,
+ #     inputs="text",
+ #     outputs="text",
+ #     title="Website Summary Generator",
+ #     description="Enter a URL to get a one-word topic summary of the website content."
+ # )
+
+ # # Launch the interface
+ # iface.launch()
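
Finally, the translation step that feeds the prompt can be checked in isolation. A minimal sketch using the same deep_translator call as classify_website (the sample text is made up):

    from deep_translator import GoogleTranslator

    # Auto-detect the source language and translate to English, truncating to
    # 4990 characters as app.py does, presumably to stay under the ~5000-character
    # limit of the translation service.
    sample = "Интернет-магазин обуви. Купить кроссовки онлайн."
    print(GoogleTranslator(source='auto', target='en').translate(sample[:4990]))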