"""Gradio app that ranks terms by semantic similarity to a main keyword.

Terms can be typed one per line and/or uploaded as a CSV file containing a
keyword column. Each term is embedded with a SentenceTransformer model and
scored against the main keyword by cosine similarity.
"""

import io

import chardet
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load embedding model once at import time so every request reuses it.
model = SentenceTransformer('all-mpnet-base-v2')

# Column layout shared by real results and error rows.
RESULT_COLUMNS = ["Term", "Similarity", "Distance"]

# CSV header names accepted as the keyword column, in priority order.
KEYWORD_COLUMNS = ('Keyword', 'keyword', 'Term', 'term', 'Keywords', 'keywords')

# Upper bound on the number of distinct terms ranked in one request.
MAX_TERMS = 1000


def _error_frame(message):
    """Return a one-row result frame carrying *message* in the Term column."""
    return pd.DataFrame([[message, "", ""]], columns=RESULT_COLUMNS)


def read_csv_terms(file_bytes):
    """Extract the unique, non-empty terms from an uploaded CSV file.

    Args:
        file_bytes: Raw bytes of the CSV file, or ``None``/empty when nothing
            was uploaded.

    Returns:
        A list of stripped term strings in first-seen order; an empty list
        when no bytes were supplied.

    Raises:
        ValueError: If none of ``KEYWORD_COLUMNS`` is present in the CSV.
    """
    if not file_bytes:
        return []

    # chardet reports encoding=None when it cannot decide; fall back to UTF-8
    # explicitly rather than passing None through.
    detected = chardet.detect(file_bytes)
    encoding = detected.get('encoding') or 'utf-8'
    df = pd.read_csv(io.BytesIO(file_bytes), encoding=encoding)

    keyword_col = next((col for col in KEYWORD_COLUMNS if col in df.columns), None)
    if keyword_col is None:
        raise ValueError("Couldn't find a keyword column in CSV file.")

    # Coerce to str so numeric cells can be embedded, and strip so that
    # "seo " and "seo" are treated as the same term.
    terms = df[keyword_col].dropna().astype(str).str.strip()
    return terms[terms != ""].unique().tolist()


def rank_terms(main_keyword, terms_text, csv_file):
    """Rank terms by semantic similarity to *main_keyword*.

    Args:
        main_keyword: The reference keyword to compare every term against.
        terms_text: Newline-separated terms typed by the user (may be empty).
        csv_file: Raw bytes of an optional CSV upload (may be ``None``).

    Returns:
        A DataFrame with columns ``Term``, ``Similarity`` and ``Distance``
        (``1 - similarity``), sorted by descending similarity. On invalid
        input or CSV failure, a single-row frame whose ``Term`` cell holds
        the error message.
    """
    main_keyword = (main_keyword or "").strip()
    terms_text = terms_text or ""

    terms_list = [line.strip() for line in terms_text.split('\n') if line.strip()]

    if csv_file:
        try:
            terms_list.extend(read_csv_terms(csv_file))
        except Exception as e:  # UI boundary: report any CSV problem to the user
            return _error_frame(f"CSV error: {str(e)}")

    # Deduplicate while keeping first-seen order so results (including ties)
    # are reproducible between runs; a plain set() would shuffle them.
    terms_list = list(dict.fromkeys(terms_list))

    if len(terms_list) > MAX_TERMS:
        return _error_frame(
            f"Error: More than {MAX_TERMS} terms provided after deduplication."
        )

    if not main_keyword or not terms_list:
        return _error_frame("Error: Provide a main keyword and at least one term.")

    keyword_embedding = model.encode(main_keyword, convert_to_tensor=True)
    terms_embeddings = model.encode(terms_list, convert_to_tensor=True)

    cosine_scores = util.cos_sim(keyword_embedding, terms_embeddings)[0]
    distances = 1 - cosine_scores  # semantic distance = 1 - similarity

    df = pd.DataFrame(
        {
            "Term": terms_list,
            # Cast to float64 before rounding so the rounded values do not
            # pick up lower-precision representation noise when displayed.
            "Similarity": cosine_scores.cpu().numpy().astype(float),
            "Distance": distances.cpu().numpy().astype(float),
        },
        columns=RESULT_COLUMNS,
    )
    # Stable sort keeps first-seen order among equally similar terms.
    df = df.sort_values("Similarity", ascending=False, kind="stable")
    df = df.reset_index(drop=True)
    df["Similarity"] = df["Similarity"].round(4)
    df["Distance"] = df["Distance"].round(4)
    return df


iface = gr.Interface(
    fn=rank_terms,
    inputs=[
        gr.Textbox(lines=1, label="Main Keyword"),
        gr.Textbox(lines=10, label="List of Terms (one per line, optional)"),
        gr.File(label="Upload CSV (optional)", file_types=['.csv'], type='binary'),
    ],
    outputs=gr.Dataframe(
        headers=["Term", "Similarity", "Distance"],
        datatype=["str", "number", "number"],
    ),
    title="Semantic Similarity & Distance Ranking Tool (with CSV Upload)",
    description=(
        "Enter a main keyword and/or terms. Optionally upload a CSV file from Ahrefs. "
        "Outputs terms ranked by semantic similarity and distance (1 - similarity) to your keyword."
    ),
)

if __name__ == "__main__":
    iface.launch()