# Source: Hugging Face Space "datttaa", file app.py (4.59 kB).
# Last commit: "Update app.py" by ramalMr (b51bb0e, verified).
from huggingface_hub import InferenceClient
import gradio as gr
import random
import pandas as pd
from io import BytesIO
import csv
import os
import io
import tempfile
import re
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
# Remote text-generation client bound to the hosted Mixtral instruct model.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
# M2M100 tokenizer and model, loaded once at import time and shared by the
# translation helpers in this file.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
def translate_to_english(text, source_lang):
    """Translate ``text`` from ``source_lang`` into English with M2M100.

    Args:
        text: The text to translate.
        source_lang: M2M100 language code of ``text`` (for example ``"az"``).
            A falsy value leaves the tokenizer's current source language
            untouched.

    Returns:
        The English translation as a single string.
    """
    # M2M100 tags the encoder input with the language held in
    # ``tokenizer.src_lang``. Without setting it, ``source_lang`` was ignored
    # and the input was labelled with whatever language the tokenizer
    # happened to be configured for.
    previous_src_lang = tokenizer.src_lang
    if source_lang:
        tokenizer.src_lang = source_lang
    try:
        encoded_input = tokenizer(text, return_tensors="pt")
    finally:
        # The tokenizer is shared module state; put it back so other callers
        # are not affected by this call.
        tokenizer.src_lang = previous_src_lang

    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
def translate_to_azerbaijani(text):
    """Translate English ``text`` into Azerbaijani with M2M100.

    Args:
        text: English text to translate.

    Returns:
        The Azerbaijani translation as a single string.
    """
    # State the source language explicitly instead of relying on whatever
    # ``tokenizer.src_lang`` currently holds: the tokenizer is shared module
    # state and M2M100 uses this value to tag the encoder input.
    previous_src_lang = tokenizer.src_lang
    tokenizer.src_lang = "en"
    try:
        encoded_input = tokenizer(text, return_tensors="pt")
    finally:
        # Restore the shared tokenizer so other callers see it unchanged.
        tokenizer.src_lang = previous_src_lang

    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.get_lang_id("az"),
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
def extract_text_from_excel(file):
    """Read the ``'Unnamed: 1'`` column of an Excel file and translate it to English.

    Each non-empty cell is translated on its own and the translations are
    joined with single spaces.

    Args:
        file: Anything ``pandas.read_excel`` accepts (path or file-like object).

    Returns:
        The English translation of the column's text as one string; an empty
        string when the column has no usable cells.

    Raises:
        KeyError: If the sheet has no ``'Unnamed: 1'`` column.
    """
    column_name = 'Unnamed: 1'
    source_lang = "az"  # Azerbaijani

    df = pd.read_excel(file)
    if column_name not in df.columns:
        raise KeyError(
            f"Expected column {column_name!r} in the uploaded sheet; "
            f"found {list(df.columns)!r}"
        )

    # Drop empty cells first: ``astype(str)`` would otherwise turn them into
    # the literal text "nan" and feed that to the translator.
    cells = df[column_name].dropna().astype(str)

    # Translate cell by cell rather than as one giant string.
    # NOTE(review): the translation model has a bounded input window and
    # ``generate`` a bounded default output length, so a whole column in a
    # single call is presumably truncated or rejected -- confirm limits.
    translated_cells = []
    for cell in cells:
        cell = cell.strip()
        if not cell:
            continue
        translated_cells.append(translate_to_english(cell, source_lang))
    return ' '.join(translated_cells)
def save_to_csv(sentence, output, filename="synthetic_data.csv"):
    """Append a ``[sentence, translated output]`` row to a CSV file.

    ``output`` is translated to Azerbaijani before the file is opened; the
    row is then appended to ``filename`` using UTF-8.
    """
    row = [sentence, translate_to_azerbaijani(output)]
    with open(filename, mode='a', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerow(row)
def _stream_completion(prompt, generate_kwargs):
    """Return the streamed completion for ``prompt`` as a single string."""
    stream = client.text_generation(
        prompt,
        **generate_kwargs,
        stream=True,
        details=True,
        return_full_text=False,
    )
    # Skip tokens flagged as special (e.g. an end-of-sequence marker) so
    # their literal text does not end up inside the generated sentences.
    return "".join(
        response.token.text
        for response in stream
        if not getattr(response.token, "special", False)
    )


def _split_generated_sentences(output):
    """Split model output after ``. ! ? :`` and drop empty or lone-period pieces."""
    pieces = (piece.strip() for piece in re.split(r'(?<=[.!?:])\s+', output))
    return [piece for piece in pieces if piece and piece != '.']


def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_similar_sentences):
    """Build a CSV of synthetic sentences from an uploaded Excel file.

    The sheet's text is extracted and translated to English, split on ``.``
    into sentences, shuffled, and each sentence is used as a prompt for the
    text-generation client. Up to ``num_similar_sentences`` randomly chosen
    sentences from each completion are written next to their prompt.

    Args:
        file: Uploaded Excel file, passed to ``extract_text_from_excel``.
        temperature: Sampling temperature forwarded to the client.
        max_new_tokens: Maximum number of new tokens per completion.
        top_p: Nucleus-sampling threshold forwarded to the client.
        repetition_penalty: Repetition penalty forwarded to the client.
        num_similar_sentences: Maximum rows written per original sentence.

    Returns:
        Path of the temporary ``.csv`` file holding the results. The file is
        created with ``delete=False`` and is not removed by this function.
    """
    text = extract_text_from_excel(file)
    sentences = text.split('.')
    random.shuffle(sentences)  # Shuffle sentences

    # UI sliders may deliver whole numbers as floats; ``range``/sampling and
    # token counts need real integers.
    rows_per_sentence = max(0, int(num_similar_sentences))

    # Identical for every prompt, so build it once outside the loop.
    generate_kwargs = {
        "temperature": temperature,
        "max_new_tokens": int(max_new_tokens),
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }

    # Write UTF-8 explicitly so non-ASCII output does not depend on the
    # platform's default encoding (and matches save_to_csv).
    with tempfile.NamedTemporaryFile(
        mode='w', newline='', delete=False, suffix='.csv', encoding='utf-8'
    ) as tmp:
        fieldnames = ['Original Sentence', 'Generated Sentence']
        writer = csv.DictWriter(tmp, fieldnames=fieldnames)
        writer.writeheader()

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # Best effort: a failure for one prompt is reported and skipped
            # so the remaining sentences are still processed.
            try:
                output = _stream_completion(sentence, generate_kwargs)
            except Exception as e:
                print(f"Error generating data for sentence '{sentence}': {e}")
                continue

            candidates = _split_generated_sentences(output)
            sample_size = min(rows_per_sentence, len(candidates))
            for generated_sentence in random.sample(candidates, sample_size):
                writer.writerow({
                    'Original Sentence': sentence,
                    'Generated Sentence': generated_sentence,
                })

        tmp_path = tmp.name
    return tmp_path
# Input widgets, in the same order as the parameters of ``generate``.
input_components = [
    gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=5120,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.95,
        minimum=0.0,
        maximum=1,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.0,
        minimum=1.0,
        maximum=2.0,
        step=0.1,
        interactive=True,
        info="Penalize repeated tokens",
    ),
    gr.Slider(
        label="Number of similar sentences",
        value=10,
        minimum=1,
        maximum=20,
        step=1,
        interactive=True,
        info="Number of similar sentences to generate for each original sentence",
    ),
]

# Build the web UI around ``generate`` and start serving it.
demo = gr.Interface(
    fn=generate,
    inputs=input_components,
    outputs=gr.File(label="Synthetic Data "),
    title="SDG",
    description="AYE QABIL.",
    allow_flagging="never",
)
demo.launch()