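"""Gradio app for Hugging Face Spaces: fetches a web page, translates the
extracted text to English, and asks a 4-bit Mistral-7B-Instruct model (loaded
via Unsloth) for a one-word topic describing the site."""
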
import gradio as gr
import spaces  # provides the @spaces.GPU() decorator used below
import asyncio
import urllib.request
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from unsloth import FastLanguageModel
import torch
import re



# Define helper functions
async def fetch_data(url):
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Referer': url,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    encoding = 'utf-8'
    timeout = 10
    
    try:
        def get_content():
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=timeout) as response:
                return response.read()

        response_content = await asyncio.get_running_loop().run_in_executor(None, get_content)

        soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

        title_tag = soup.find('title')
        title = title_tag.text if title_tag else ""
        description = soup.find('meta', attrs={'name': 'description'})
        if description and "content" in description.attrs:
            description = description.get("content")
        else:
            description = ""

        keywords = soup.find('meta', attrs={'name': 'keywords'})
        if keywords and "content" in keywords.attrs:
            keywords = keywords.get("content")
        else:
            keywords = ""

        h1_all = " ".join(h.text for h in soup.find_all('h1'))
        h2_all = " ".join(h.text for h in soup.find_all('h2'))
        h3_all = " ".join(h.text for h in soup.find_all('h3'))
        paragraphs_all = " ".join(p.text for p in soup.find_all('p'))

        allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
        allthecontent = allthecontent[:4999]

        return {
            'url': url,
            'title': title,
            'description': description,
            'keywords': keywords,
            'h1': h1_all,
            'h2': h2_all,
            'h3': h3_all,
            'paragraphs': paragraphs_all,
            'text': allthecontent
        }
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return {
            'url': url,
            'title': None,
            'description': None,
            'keywords': None,
            'h1': None,
            'h2': None,
            'h3': None,
            'paragraphs': None,
            'text': None
        }
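
# Example (local smoke test; the URL is a placeholder):
#   page = asyncio.run(fetch_data("https://example.com"))
#   print(page["title"])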

def concatenate_text(data):
    # Join the metadata fields (paragraph text is intentionally excluded) and
    # normalize whitespace.
    keys = ['url', 'title', 'description', 'keywords', 'h1', 'h2', 'h3']
    text_parts = [str(data[key]) for key in keys if data[key]]
    text = ' '.join(text_parts)
    text = text.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
    text = re.sub(r'\s{2,}', ' ', text)
    return text

def translate_text(text):
    try:
        text = text[:4990]
        translated_text = GoogleTranslator(source='auto', target='en').translate(text)
        return translated_text
    except Exception as e:
        print(f"An error occurred during translation: {e}")
        return text  # fall back to the untranslated text
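
# Note: deep-translator's GoogleTranslator rejects single requests over
# roughly 5,000 characters, which is why the text is truncated beforehand.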

@spaces.GPU()
def summarize_url(url):
    # Load the model inside the GPU-decorated function so that, on ZeroGPU
    # Spaces, a GPU is attached only for the duration of the call.
    max_seq_length = 2048
    dtype = None  # None = auto-detect (float16 or bfloat16 depending on GPU)
    load_in_4bit = True
    
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    
    # Enable native 2x faster inference
    FastLanguageModel.for_inference(model)
    
    result = asyncio.run(fetch_data(url))
    text = concatenate_text(result)
    translated_text = translate_text(text)
    
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    Describe the website text into one word topic:

    ### Input:
    {}

    ### Response:
    """
    
    prompt = alpaca_prompt.format(translated_text)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded output echoes the prompt, so keep only what follows "### Response:".
    final_answer = summary.split("### Response:")[-1].strip()
    return final_answer

# Define Gradio interface
iface = gr.Interface(
    fn=summarize_url,
    inputs="text",
    outputs="text",
    title="Website Summary Generator",
    description="Enter a URL to get a one-word topic summary of the website content."
)

# Launch the Gradio app
iface.launch()
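
# Optional: iface.launch() accepts the standard Gradio kwargs, e.g.
#   iface.launch(server_name="0.0.0.0", server_port=7860)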