import gradio as gr
import torch
import spaces
import logging
from deep_translator import GoogleTranslator
import pandas as pd
from tqdm import tqdm
import urllib
from bs4 import BeautifulSoup

# Configure logging to write messages to a file
logging.basicConfig(filename='app.log', level=logging.ERROR)

# Configuration
max_seq_length = 2048
dtype = None  # Auto detection of dtype
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage

# peft_model_name = "limitedonly41/website_qwen2_7b_2"
# peft_model_name = "limitedonly41/website_mistral7b_v02"
peft_model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"

# Initialize model and tokenizer variables
model = None
tokenizer = None

def fetch_data(url):
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Referer': f'{url}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    encoding = 'utf-8'
    timeout = 10  # Set your desired timeout value in seconds
    try:
        # Make the request using urllib
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            response_content = response.read()

        soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

        title = soup.find('title').text
        description = soup.find('meta', attrs={'name': 'description'})
        description = description.get("content") if description and "content" in description.attrs else ""

        keywords = soup.find('meta', attrs={'name': 'keywords'})
        keywords = keywords.get("content") if keywords and "content" in keywords.attrs else ""

        h1_all = ". ".join(h.text for h in soup.find_all('h1'))
        paragraphs_all = ". ".join(p.text for p in soup.find_all('p'))
        h2_all = ". ".join(h.text for h in soup.find_all('h2'))
        h3_all = ". ".join(h.text for h in soup.find_all('h3'))

        allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"[:4999]

        # Clean up the text
        h1_all = h1_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
        h2_all = h2_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
        h3_all = h3_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')

        return {
            'url': url,
            'title': title,
            'description': description,
            'keywords': keywords,
            'h1': h1_all,
            'h2': h2_all,
            'h3': h3_all,
            'paragraphs': paragraphs_all,
            'text': allthecontent
        }
    except Exception as e:
        print(url, e)
        return {
            'url': url,
            'title': None,
            'description': None,
            'keywords': None,
            'h1': None,
            'h2': None,
            'h3': None,
            'paragraphs': None,
            'text': None
        }

def main(urls):
    results = []
    for url in tqdm(urls):
        result = fetch_data(url)
        results.append(result)
    return results


@spaces.GPU()
def classify_website(url):
    from unsloth import FastLanguageModel  # Import moved to the top for model loading

    global model, tokenizer  # Declare model and tokenizer as global variables

    if model is None or tokenizer is None:
    
        # Load the model and tokenizer during initialization (in the main process)
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=peft_model_name,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
        )
        FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    
    urls = [url]
    results_shop = main(urls)

    # Convert results to DataFrame
    df_result_train_more = pd.DataFrame(results_shop)
    text = df_result_train_more['text'][0]
    translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    Describe the website text into one word topic:

    ### Input:
    {}

    ### Response:
    """
    
    prompt = alpaca_prompt.format(translated)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    final_answer = summary.split("### Response:")[1].strip()
    return final_answer

# Create a Gradio interface
iface = gr.Interface(
    fn=classify_website,
    inputs="text",
    outputs="text",
    title="Website Topic",
    description="Enter a URL to get a topic summary of the website content."
)

# Launch the interface
iface.launch()


# import gradio as gr
# import asyncio
# import requests
# from bs4 import BeautifulSoup
# import pandas as pd
# from tqdm import tqdm
# import urllib
# from deep_translator import GoogleTranslator
# import spaces


# # from unsloth import FastLanguageModel
# import torch
# import re


# # Define helper functions
# async def fetch_data(url):
#     headers = {
#         'Accept': '*/*',
#         'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
#         'Connection': 'keep-alive',
#         'Referer': f'{url}',
#         'Sec-Fetch-Dest': 'empty',
#         'Sec-Fetch-Mode': 'cors',
#         'Sec-Fetch-Site': 'cross-site',
#         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
#         'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
#         'sec-ch-ua-mobile': '?0',
#         'sec-ch-ua-platform': '"macOS"',
#     }

#     encoding = 'utf-8'
#     timeout = 10
    
#     try:
#         def get_content():
#             req = urllib.request.Request(url, headers=headers)
#             with urllib.request.urlopen(req, timeout=timeout) as response:
#                 return response.read()

#         response_content = await asyncio.get_event_loop().run_in_executor(None, get_content)

#         soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

#         title = soup.find('title').text
#         description = soup.find('meta', attrs={'name': 'description'})
#         if description and "content" in description.attrs:
#             description = description.get("content")
#         else:
#             description = ""

#         keywords = soup.find('meta', attrs={'name': 'keywords'})
#         if keywords and "content" in keywords.attrs:
#             keywords = keywords.get("content")
#         else:
#             keywords = ""

#         h1_all = " ".join(h.text for h in soup.find_all('h1'))
#         h2_all = " ".join(h.text for h in soup.find_all('h2'))
#         h3_all = " ".join(h.text for h in soup.find_all('h3'))
#         paragraphs_all = " ".join(p.text for p in soup.find_all('p'))

#         allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
#         allthecontent = allthecontent[:4999]

#         return {
#             'url': url,
#             'title': title,
#             'description': description,
#             'keywords': keywords,
#             'h1': h1_all,
#             'h2': h2_all,
#             'h3': h3_all,
#             'paragraphs': paragraphs_all,
#             'text': allthecontent
#         }
#     except Exception as e:
#         return {
#             'url': url,
#             'title': None,
#             'description': None,
#             'keywords': None,
#             'h1': None,
#             'h2': None,
#             'h3': None,
#             'paragraphs': None,
#             'text': None
#         }

# def concatenate_text(data):
#     text_parts = [str(data[col]) for col in ['url', 'title', 'description', 'keywords', 'h1', 'h2', 'h3'] if data[col]]
#     text = ' '.join(text_parts)
#     text = text.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
#     text = re.sub(r'\s{2,}', ' ', text)
#     return text

# def translate_text(text):
#     try:
#         text = text[:4990]
#         translated_text = GoogleTranslator(source='auto', target='en').translate(text)
#         return translated_text
#     except Exception as e:
#         print(f"An error occurred during translation: {e}")
#         return None


# model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"

# # Initialize model and tokenizer variables
# model = None
# tokenizer = None

# @spaces.GPU()
# def summarize_url(url):

#     global model, tokenizer  # Declare model and tokenizer as global variables

#     # Load the model
#     max_seq_length = 2048
#     dtype = None
#     load_in_4bit = True

#     if model is None or tokenizer is None:
#         from unsloth import FastLanguageModel
        
#         # Load the model and tokenizer
#         model, tokenizer = FastLanguageModel.from_pretrained(
#             model_name=model_name,  # YOUR MODEL YOU USED FOR TRAINING
#             max_seq_length=max_seq_length,
#             dtype=dtype,
#             load_in_4bit=load_in_4bit,
#         )
#         FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    
#     result = asyncio.run(fetch_data(url))
#     text = concatenate_text(result)
#     translated_text = translate_text(text)
#     if len(translated_text) < 100:
#         return 'not scraped or short text'
#     alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

#     ### Instruction:
#     Describe the website text into one word topic:

#     ### Input:
#     {}

#     ### Response:
#     """
    
#     prompt = alpaca_prompt.format(translated_text)
#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

#     outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
#     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     final_answer = summary.split("### Response:")[1].strip()
#     return final_answer


# # # Create the Gradio interface within a `Blocks` context, like the working example
# # with gr.Blocks() as demo:
    
# #     # Add title and description to the interface
# #     gr.HTML("<h1>Website Summary Generator</h1>")
# #     gr.HTML("<p>Enter a URL to get a one-word topic summary of the website content..</p>")
    
# #     # Define input and output elements
# #     with gr.Row():
# #         prompt = gr.Textbox(label="Enter Website URL", placeholder="https://example.com")
# #         output_text = gr.Textbox(label="Topic", interactive=False)
    
# #     # Add the button to trigger the function
# #     submit = gr.Button("Classify")
    
# #     # Define the interaction between inputs and outputs
# #     submit.click(fn=summarize_url, inputs=prompt, outputs=output_text)

# # # Add the `if __name__ == "__main__":` block to launch the interface
# # if __name__ == "__main__":
# #     demo.launch()

    
# # with gr as demo:
# #     # Define Gradio interface
# #     demo = demo.Interface(
# #         fn=summarize_url,
# #         inputs="text",
# #         outputs="text",
# #         title="Website Summary Generator",
# #         description="Enter a URL to get a one-word topic summary of the website content."
# #     )


# # if __name__ == "__main__":
# #     demo.launch()


# # Create a Gradio interface
# iface = gr.Interface(
#     fn=summarize_url,
#     inputs="text",
#     outputs="text",
#     title="Website Summary Generator",
#     description="Enter a URL to get a one-word topic summary of the website content."
# )

# # Launch the interface
# iface.launch()