"""Streamlit app for the Swahili Text Embeddings Leaderboard (STEL).

The leaderboard is not stored separately: it is parsed at start-up from the
markdown table in ``README.md`` whose header row starts with ``| Model Name``.
"""

import io
import re

import pandas as pd
import streamlit as st

# Constants
GITHUB_URL = "https://github.com/Sartify/STEL"
POSSIBLE_NON_BENCHMARK_COLS = [
    "Model Name",
    "Publisher",
    "Open?",
    "Basemodel",
    "Matryoshka",
    "Dimension",
    "Average",
]
# Columns that are kept as text; every other column except "Dimension" is
# coerced to float.
TEXT_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka"]
README_PATH = "README.md"
LEADERBOARD_TABLE_START = "| Model Name"

# Matches a markdown link ``[text](target)`` at the start of a string.
_MARKDOWN_LINK_RE = re.compile(r'\[(.*?)\]\(.*?\)')


def extract_table_from_markdown(markdown_text, table_start):
    """Return the markdown table that begins with ``table_start``.

    Args:
        markdown_text: Full markdown document.
        table_start: Prefix of the table's header line.

    Returns:
        The table lines joined with newlines, starting at the first line that
        begins with ``table_start`` and ending just before the next blank line
        or line starting with ``#``. An empty string if no line matches.
    """
    table_lines = []
    capturing = False
    # splitlines() (rather than split('\n')) keeps CRLF files from leaving a
    # stray '\r' on every captured line.
    for line in markdown_text.splitlines():
        if line.startswith(table_start):
            capturing = True
        elif capturing and (line.startswith('#') or not line.strip()):
            break  # A new section or an empty line ends the table.
        if capturing:
            table_lines.append(line)
    return '\n'.join(table_lines)


def extract_model_name(link):
    """Return the link text of a markdown link, or ``link`` unchanged.

    Args:
        link: A cell value such as ``"[name](https://...)"`` or plain text.

    Returns:
        ``"name"`` for a markdown link, otherwise the input as-is.
    """
    match = _MARKDOWN_LINK_RE.match(link)
    return match.group(1) if match else link


def _split_markdown_row(line):
    """Split one markdown table row into stripped cell strings.

    Only the outer border pipes are removed; empty interior cells are kept as
    ``''`` so that each value stays under its own header.
    """
    row = line.strip()
    if row.startswith('|'):
        row = row[1:]
    if row.endswith('|'):
        row = row[:-1]
    return [cell.strip() for cell in row.split('|')]


def markdown_table_to_df(table_content):
    """Convert a markdown table to a pandas DataFrame.

    Args:
        table_content: The table text: a header line, a separator line, then
            one line per row.

    Returns:
        A DataFrame with one column per header. "Model Name" links are reduced
        to their link text, "Dimension" holds ``int`` values (or ``""`` when
        the cell is not all digits), the columns in ``TEXT_COLS`` stay as
        strings, and every other column is converted to float with
        unparsable cells becoming NaN. An empty DataFrame is returned when
        the table has no header.
    """
    lines = table_content.splitlines()

    headers = _split_markdown_row(lines[0]) if lines else []
    if not any(headers):
        return pd.DataFrame()

    data = []
    for line in lines[2:]:  # Skip the header and its separator line.
        cells = _split_markdown_row(line)
        if not any(cells):
            continue  # Ignore rows with no content at all.
        # Pad short rows and trim long ones so every row matches the header.
        cells += [''] * (len(headers) - len(cells))
        data.append(cells[:len(headers)])

    df = pd.DataFrame(data, columns=headers)

    # Show the plain model name rather than the raw markdown link.
    if 'Model Name' in df.columns:
        df['Model Name'] = df['Model Name'].apply(extract_model_name)

    for col in df.columns:
        if col == "Dimension":
            df[col] = df[col].apply(lambda x: int(x) if x.isdigit() else "")
        elif col not in TEXT_COLS:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df


def setup_page():
    """Configure the Streamlit page and render the title and logo."""
    st.set_page_config(
        page_title="Swahili Text Embeddings Leaderboard",
        page_icon="⚡",
        layout="wide",
    )
    st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
    st.image(
        "https://huggingface.co/spaces/sartifyllc/Swahili-Text-Embeddings-Leaderboard/resolve/main/STEL.jpg",
        width=300,
    )


def display_leaderboard(df):
    """Render the leaderboard table with a benchmark filter and CSV download.

    Args:
        df: Leaderboard DataFrame as produced by ``markdown_table_to_df``.
    """
    st.header("📊 Leaderboard")

    # Metadata columns are always shown; only benchmark columns are optional.
    present_non_benchmark_cols = [
        col for col in POSSIBLE_NON_BENCHMARK_COLS if col in df.columns
    ]
    columns_to_filter = [
        col for col in df.columns if col not in present_non_benchmark_cols
    ]
    selected_columns = st.multiselect(
        "Select benchmarks to display:",
        columns_to_filter,
        default=columns_to_filter,
    )

    df_display = df[present_non_benchmark_cols + selected_columns]

    # Only float columns get the fixed four-decimal formatting.
    float_cols = [
        col for col in df_display.columns if df_display[col].dtype == 'float64'
    ]
    st.dataframe(df_display.style.format("{:.4f}", subset=float_cols))

    csv = df_display.to_csv(index=False)
    st.download_button(
        label="Download as CSV",
        data=csv,
        file_name="leaderboard.csv",
        mime="text/csv",
    )


def display_evaluation():
    """Render the evaluation section with an example MTEB script."""
    st.header("🧪 Evaluation")
    st.markdown("""
    To evaluate a model on the Swahili Embeddings Text Benchmark, you can use the following Python script:

    ```bash
    pip install mteb
    pip install sentence-transformers
    ```

    ```python
    import mteb
    import torch
    from sentence_transformers import SentenceTransformer

    model_name = "MultiLinguSwahili-serengeti-E250-nli-matryoshka"
    publisher = "sartifyllc"
    models = ["sartifyllc/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka", f"{publisher}/{model_name}"]

    for model_name in models:
        truncate_dim = 768
        language = "swa"

        device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
        model = SentenceTransformer(model_name, device=device, trust_remote_code=True)

        tasks = [
            mteb.get_task("AfriSentiClassification", languages=["swa"]),
            mteb.get_task("AfriSentiLangClassification", languages=["swa"]),
            mteb.get_task("MasakhaNEWSClassification", languages=["swa"]),
            mteb.get_task("MassiveIntentClassification", languages=["swa"]),
            mteb.get_task("MassiveScenarioClassification", languages=["swa"]),
            mteb.get_task("SwahiliNewsClassification", languages=["swa"]),
        ]

        evaluation = mteb.MTEB(tasks=tasks)
        results = evaluation.run(model, output_folder=f"{model_name}")

        tasks = mteb.get_tasks(task_types=["PairClassification", "Reranking", "BitextMining", "Clustering", "Retrieval"], languages=["swa"])

        evaluation = mteb.MTEB(tasks=tasks)
        results = evaluation.run(model, output_folder=f"{model_name}")
    ```
    """)


def display_contribution():
    """Render the "How to Contribute" section."""
    st.header("🤝 How to Contribute")
    st.markdown("""
    We welcome and appreciate all contributions! You can help by:

    ### Table Work
    - Filling in missing entries.
    - New models are added as new rows to the leaderboard (maintaining descending order).
    - Add new benchmarks as new columns in the leaderboard and include them in the benchmarks table (maintaining descending order).

    ### Code Work
    - Improving the existing code.
    - Requesting and implementing new features.
    """)


def display_sponsorship():
    """Render the sponsorship section."""
    st.header("🤝 Sponsorship")
    st.markdown("""
    This benchmark is Swahili-based, and we need support translating and curating more tasks into Swahili. Sponsorships are welcome to help advance this endeavour. Your sponsorship will facilitate essential translation efforts, bridge language barriers, and make the benchmark accessible to a broader audience. We are grateful for the dedication shown by our collaborators and aim to extend this impact further with the support of sponsors committed to advancing language technologies.
    """)


def main():
    """Build the page: leaderboard parsed from the README, then the info sections."""
    setup_page()

    # Read explicitly as UTF-8 so non-ASCII README text does not depend on the
    # platform's default encoding.
    with open(README_PATH, "r", encoding="utf-8") as f:
        readme_content = f.read()

    leaderboard_table = extract_table_from_markdown(
        readme_content, LEADERBOARD_TABLE_START
    )
    df_leaderboard = markdown_table_to_df(leaderboard_table)

    display_leaderboard(df_leaderboard)
    display_evaluation()
    display_contribution()
    display_sponsorship()

    st.markdown("---")
    st.markdown("Thank you for being part of this effort to advance Swahili language technologies!")


if __name__ == "__main__":
    main()