Spaces:

limitedonly41
/

website_topic

Runtime error

App Files Community

website_topic / app.py

limitedonly41

Update app.py

59fb33a verified 4 months ago

raw

history blame contribute delete

4.92 kB

	import asyncio
	import re
	import urllib
	import urllib.parse
	import urllib.request

	import gradio as gr
	import pandas as pd
	import requests
	import spaces
	import torch
	from bs4 import BeautifulSoup
	from deep_translator import GoogleTranslator
	from tqdm import tqdm
	from unsloth import FastLanguageModel



	# Define helper functions
	async def fetch_data(url):
	    """Download *url* and extract its title, meta tags, headings and paragraphs.

	    The blocking HTTP request is executed in the event loop's default
	    executor. Any failure (rejected scheme, network error, parse error) is
	    printed and produces a record whose fields, apart from ``url``, are
	    all ``None``.

	    Args:
	        url: Address of the page to fetch. Only ``http`` and ``https``
	            URLs are requested.

	    Returns:
	        dict with the keys ``url``, ``title``, ``description``,
	        ``keywords``, ``h1``, ``h2``, ``h3``, ``paragraphs`` and ``text``.
	        ``text`` is the title, description, headings and paragraphs joined
	        by spaces and cut to 4999 characters.
	    """
	    headers = {
	        # '*/*' is the wildcard media range; a bare '/' is not a valid value.
	        'Accept': '*/*',
	        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
	        'Connection': 'keep-alive',
	        'Referer': f'{url}',
	        'Sec-Fetch-Dest': 'empty',
	        'Sec-Fetch-Mode': 'cors',
	        'Sec-Fetch-Site': 'cross-site',
	        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
	        'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
	        'sec-ch-ua-mobile': '?0',
	        'sec-ch-ua-platform': '"macOS"',
	    }

	    encoding = 'utf-8'
	    timeout = 10

	    def get_content():
	        # Blocking download; called from a worker thread below.
	        req = urllib.request.Request(url, headers=headers)
	        with urllib.request.urlopen(req, timeout=timeout) as response:
	            return response.read()

	    def meta_content(soup, name):
	        # Value of <meta name=...>'s "content" attribute, or "" when absent.
	        tag = soup.find('meta', attrs={'name': name})
	        if tag is not None and "content" in tag.attrs:
	            return tag.get("content")
	        return ""

	    def joined_text(soup, tag_name):
	        # Text of every <tag_name> element, separated by single spaces.
	        return " ".join(node.text for node in soup.find_all(tag_name))

	    try:
	        # The URL comes from the user: urlopen() would also accept schemes
	        # such as file://, which would expose local files.
	        scheme = urllib.parse.urlsplit(url).scheme
	        if scheme.lower() not in ('http', 'https'):
	            raise ValueError(f"unsupported URL scheme: {scheme!r}")

	        loop = asyncio.get_running_loop()
	        response_content = await loop.run_in_executor(None, get_content)

	        soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

	        # A page without <title> must not discard the rest of the content.
	        title_tag = soup.find('title')
	        title = title_tag.text if title_tag is not None else ""

	        description = meta_content(soup, 'description')
	        keywords = meta_content(soup, 'keywords')

	        h1_all = joined_text(soup, 'h1')
	        h2_all = joined_text(soup, 'h2')
	        h3_all = joined_text(soup, 'h3')
	        paragraphs_all = joined_text(soup, 'p')

	        allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
	        allthecontent = allthecontent[:4999]

	        return {
	            'url': url,
	            'title': title,
	            'description': description,
	            'keywords': keywords,
	            'h1': h1_all,
	            'h2': h2_all,
	            'h3': h3_all,
	            'paragraphs': paragraphs_all,
	            'text': allthecontent,
	        }
	    except Exception as e:
	        # Best-effort scraping: report the problem instead of hiding it and
	        # hand back an empty record so the caller can continue.
	        print(f"An error occurred while fetching {url}: {e}")
	        return {
	            'url': url,
	            'title': None,
	            'description': None,
	            'keywords': None,
	            'h1': None,
	            'h2': None,
	            'h3': None,
	            'paragraphs': None,
	            'text': None,
	        }

	def concatenate_text(data):
	    """Join the scraped metadata fields of *data* into one normalised string.

	    Args:
	        data: Mapping as returned by ``fetch_data``. Only the ``url``,
	            ``title``, ``description``, ``keywords``, ``h1``, ``h2`` and
	            ``h3`` entries are used; missing or falsy entries are skipped.

	    Returns:
	        The selected values joined by single spaces, with non-breaking
	        spaces, newlines and tabs turned into spaces and every run of
	        whitespace collapsed to one space.
	    """
	    columns = ('url', 'title', 'description', 'keywords', 'h1', 'h2', 'h3')
	    text = ' '.join(str(data[col]) for col in columns if data.get(col))
	    # '\xa0' has to be the real non-breaking-space character; a raw string
	    # would look for a literal backslash sequence and never match.
	    text = text.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
	    return re.sub(r'\s{2,}', ' ', text)

	def translate_text(text):
	    """Translate *text* to English, auto-detecting the source language.

	    Only the first 4990 characters are sent to the translator. Returns the
	    translated string, or ``None`` (after printing the error) when
	    anything goes wrong.
	    """
	    try:
	        snippet = text[:4990]
	        translator = GoogleTranslator(source='auto', target='en')
	        return translator.translate(snippet)
	    except Exception as exc:
	        print(f"An error occurred during translation: {exc}")
	        return None

	@spaces.GPU()
	def summarize_url(url):
	    """Return a short topic label for the website at *url*.

	    Loads the 4-bit Mistral instruct model through unsloth, fetches the
	    page with ``fetch_data``, joins its metadata with ``concatenate_text``,
	    translates it with ``translate_text`` and asks the model to describe
	    it.

	    Args:
	        url: Address of the website to classify.

	    Returns:
	        The text the model generated after the prompt, stripped of
	        surrounding whitespace.
	    """
	    # Load the model
	    # NOTE(review): the model is loaded on every call. Keeping CUDA state
	    # alive between spaces.GPU calls may not be safe - confirm before caching.
	    max_seq_length = 2048
	    dtype = None
	    load_in_4bit = True

	    model, tokenizer = FastLanguageModel.from_pretrained(
	        model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
	        max_seq_length=max_seq_length,
	        dtype=dtype,
	        load_in_4bit=load_in_4bit,
	    )

	    # Enable native 2x faster inference
	    FastLanguageModel.for_inference(model)

	    # Stray whitespace around a pasted URL would make the request fail.
	    result = asyncio.run(fetch_data((url or "").strip()))
	    text = concatenate_text(result)
	    # translate_text returns None on failure; fall back to the original
	    # text so the literal string "None" is never fed to the model.
	    translated_text = translate_text(text) or text

	    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

	### Instruction:
	Describe the website text into one word topic:

	### Input:
	{}

	### Response:
	"""

	    prompt = alpaca_prompt.format(translated_text)
	    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

	    # Pass the whole encoding so the attention mask reaches generate().
	    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

	    # The output repeats the prompt tokens; decode only what was generated.
	    # Splitting on "### Response:" would break if the page text contained
	    # that marker.
	    prompt_length = inputs.input_ids.shape[1]
	    final_answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
	    return final_answer

	# Gradio UI: one text box for the URL in, one text box for the topic out.
	iface = gr.Interface(
	    title="Website Summary Generator",
	    description="Enter a URL to get a one-word topic summary of the website content.",
	    fn=summarize_url,
	    inputs="text",
	    outputs="text",
	)

	# Start serving the interface.
	iface.launch()