| | |
| | |
| | |
| | import gradio as gr |
| | import pandas as pd |
| | import time |
| | from pathlib import Path |
| | import yaml |
| | from typing import List, Tuple, Optional |
| | import re |
| | from gpt4all import GPT4All |
| | from huggingface_hub import hf_hub_download |
| |
|
| | |
| | |
| | |
# Load user configuration (prompts, default model aliases) from _config.yaml.
# `or {}` guards against an empty YAML file, where safe_load returns None.
with open('_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f) or {}

default_config = config.get('defaults', {})
prompts_config = config.get('prompts', {})
title_config = config.get('title_generation', {})

# Prompt texts, with fallbacks used when the YAML omits them.
bnotes_prompt = prompts_config.get('bnotes', {}).get('prompt', 'Write comprehensive bulleted notes summarizing the provided text, with headings and terms in bold.')
title_prompt = title_config.get('prompt', 'The content between backticks is part of a book-chapter. write 8-11 words describing it.')

# Model aliases from config.
# NOTE(review): these aliases look Ollama-style and are never read again in
# this file — the actual model files are pinned in models_config below; confirm.
summary_model_alias = default_config.get('summary', 'cognitivetech/obook_summary:q6_k')
title_model_alias = default_config.get('title', 'notes')
| |
|
| | |
| | |
| | |
# Model registry: where to fetch each GGUF file on the Hugging Face Hub,
# the chat template each model expects, and generation parameters.
# NOTE(review): the 'params' sub-dicts (num_ctx/num_gpu/num_predict) are
# Ollama-style option names and are never passed to GPT4All.generate() in
# this file — presumably leftovers; confirm before relying on them.
models_config = {
    'summary': {
        # Fine-tuned Mistral-7B specialised for bulleted-notes summaries.
        'repo_id': 'cognitivetech/Mistral-7b-Inst-0.2-Bulleted-Notes_GGUF',
        'filename': 'mistral-7b-inst-0.2-bulleted-notes.Q5_K_M.gguf',
        'local_dir': 'models',
        # ChatML-style prompt wrapping (<|im_start|> / <|im_end|> markers).
        'template': {
            'prefix': '<|im_start|>user\n',
            'suffix': ' <|im_end|>\n<|im_start|>assistant\n',
            'stop_tokens': ['<|im_start|>', '<|im_end|>']
        },
        'params': {
            'num_ctx': 8000,      # context window size (tokens)
            'num_gpu': -1,        # -1 = offload all layers (Ollama convention)
            'num_predict': 4000   # generation cap for long summaries
        }
    },
    'title': {
        # Stock Mistral-7B-Instruct, used only for short title generation.
        'repo_id': 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF',
        'filename': 'mistral-7b-instruct-v0.2.Q5_0.gguf',
        'local_dir': 'models',
        # Mistral [INST] ... [/INST] prompt wrapping.
        'template': {
            'prefix': '<s>[INST] ',
            'suffix': ' [/INST]',
            'stop_tokens': ['</s>']
        },
        'params': {
            'num_ctx': 8000,
            'num_gpu': -1,
            'num_predict': 100    # titles are short (8-11 words)
        }
    }
}
| |
|
| | |
| | |
| | |
print("Downloading and initializing models...")

# Fetch both GGUF files into ./models (hf_hub_download is a no-op when the
# file is already present locally).
for model_type in ['summary', 'title']:
    cfg = models_config[model_type]
    print(f"Downloading {model_type} model...")
    hf_hub_download(
        repo_id=cfg['repo_id'],
        filename=cfg['filename'],
        local_dir=cfg['local_dir'],
        # Copy real files instead of symlinking into the HF cache.
        # NOTE(review): deprecated/ignored in newer huggingface_hub releases.
        local_dir_use_symlinks=False
    )

# CPU-only inference for both models; downloading is disabled because the
# files were just fetched above.
print("Initializing summary model...")
summary_model = GPT4All(
    model_name=models_config['summary']['filename'],
    model_path=models_config['summary']['local_dir'],
    allow_download=False,
    device="cpu"
)

print("Initializing title model...")
title_model = GPT4All(
    model_name=models_config['title']['filename'],
    model_path=models_config['title']['local_dir'],
    allow_download=False,
    device="cpu"
)

# Install each model's chat template and clear any default system prompt.
# NOTE(review): generate_title/generate_summary also wrap their prompts
# manually with the same prefix/suffix before calling generate(), so the
# template may be applied twice — confirm against gpt4all's behavior.
summary_model.config["promptTemplate"] = models_config['summary']['template']['prefix'] + "{0}" + models_config['summary']['template']['suffix']
summary_model.config["systemPrompt"] = ""

title_model.config["promptTemplate"] = models_config['title']['template']['prefix'] + "{0}" + models_config['title']['template']['suffix']
title_model.config["systemPrompt"] = ""

print("Models initialized successfully!")
| |
|
| | |
| | |
| | |
def sanitize_text(text: str) -> str:
    """Normalise raw input text: trim leading/trailing whitespace."""
    cleaned = text.strip()
    return cleaned
| |
|
# Compiled once at import time: matches a markdown bullet ("- ") whose lead
# text starts with a letter and runs (non-greedily) up to the first colon.
_BULLET_LEAD_RE = re.compile(r'^([ \t]*-[ \t]*)([a-zA-Z].*?):', re.MULTILINE)


def bold_text_before_colon(text: str) -> str:
    """Bold each markdown bullet's lead term, up to the first colon.

    Only bullets whose term starts with a letter are rewritten, so lines
    that are already bolded (term starts with ``**``) are left untouched.

    Args:
        text: Markdown text, possibly multi-line.

    Returns:
        Text with each matching bullet lead wrapped as ``**term:**``.
    """
    return _BULLET_LEAD_RE.sub(r'\1**\2:**', text)
| |
|
def generate_title(text: str, temperature: float = 0.3) -> str:
    """Generate a short title for *text* with the title model.

    Only the first 500 characters of the text are sent; the result is
    cleaned of any echoed [INST] wrapper, whitespace-collapsed, and
    capped at 150 characters.
    """
    snippet = f"```{text[:500]}```\n\n{title_prompt}"

    # Wrap the request in the Mistral [INST] template.
    template = models_config['title']['template']
    wrapped = template['prefix'] + snippet + template['suffix']

    pieces = [
        token
        for token in title_model.generate(
            prompt=wrapped,
            temp=temperature,
            top_k=40,
            top_p=0.95,
            max_tokens=100,
            streaming=True,
        )
    ]

    raw = "".join(pieces).strip()
    # Drop anything the model echoed before/including "[/INST]", then
    # collapse runs of whitespace into single spaces.
    raw = re.sub(r'^.*?\[/INST\]\s*', '', raw)
    raw = re.sub(r'\s+', ' ', raw)
    return raw[:150]
| |
|
def generate_summary(text: str, temperature: float = 0.5, max_tokens: int = 4000) -> str:
    """Generate a bulleted-notes summary of *text* with the summary model.

    The full text is wrapped in the ChatML template, streamed from the
    model, stripped of any echoed "assistant" preamble, and post-processed
    so every bullet lead term is bolded.
    """
    request = f"```{text}```\n\n{bnotes_prompt}"

    # Wrap the request in the ChatML template.
    template = models_config['summary']['template']
    wrapped = template['prefix'] + request + template['suffix']

    chunks = []
    stream = summary_model.generate(
        prompt=wrapped,
        temp=temperature,
        top_k=40,
        top_p=0.95,
        max_tokens=max_tokens,
        streaming=True,
    )
    for piece in stream:
        chunks.append(piece)

    result = "".join(chunks).strip()
    # Drop anything echoed before/including the "assistant" role marker.
    result = re.sub(r'^.*?assistant\s*', '', result)
    return bold_text_before_colon(result)
| |
|
| | |
| | |
| | |
def process_csv(
    file_obj,
    use_existing_titles: bool = True,
    generate_missing_titles: bool = True,
    temperature: float = 0.5,
    title_temperature: float = 0.3
):
    """Process a CSV with a 'text' (and optional 'title') column.

    Generator yielding ``(DataFrame, status_message)`` after each processed
    row so the UI can update incrementally. Error conditions are yielded
    (not returned) so consumers iterating the generator actually see them.

    Args:
        file_obj: Uploaded file object exposing a ``.name`` path.
        use_existing_titles: Reuse non-empty values from the 'title' column.
        generate_missing_titles: Generate a title when none is available.
        temperature: Sampling temperature for summaries.
        title_temperature: Sampling temperature for titles.
    """
    try:
        df = pd.read_csv(file_obj.name)
    except Exception as e:
        # BUG FIX: `return value` inside a generator just sets the
        # StopIteration value, which a `for` consumer never sees — the
        # error message was silently dropped. Yield it instead.
        yield None, f"Error reading CSV: {str(e)}"
        return

    if 'text' not in df.columns:
        yield None, "CSV must contain 'text' column"
        return

    output_rows = []

    for idx, row in df.iterrows():
        # Guard against NaN cells: str(nan) == 'nan' is truthy and would
        # otherwise leak into titles and summaries.
        raw_text = row.get('text', '')
        text = '' if pd.isna(raw_text) else str(raw_text)

        original_title = ''
        if 'title' in df.columns and use_existing_titles:
            raw_title = row.get('title', '')
            if pd.notna(raw_title):
                original_title = str(raw_title)

        # Skip rows with no usable text.
        if not text.strip():
            continue

        start_time = time.time()

        # Title resolution: existing > generated > positional placeholder.
        if original_title and use_existing_titles:
            title = original_title
            title_generated = False
        elif generate_missing_titles:
            title = generate_title(text, temperature=title_temperature)
            title_generated = True
        else:
            title = f"Text_{idx+1}"
            title_generated = False

        summary = generate_summary(text, temperature=temperature)
        end_time = time.time()

        elapsed_time = end_time - start_time

        output_row = {
            'title': title,
            'text': text,
            'text.len': len(text),
            'output': summary,
            'output.len': len(summary),
            'time': elapsed_time
        }

        # Track provenance when an existing title was available.
        if original_title and use_existing_titles:
            output_row['original_title'] = original_title
            output_row['title_generated'] = title_generated

        output_rows.append(output_row)

        # Incremental update for streaming consumers.
        yield pd.DataFrame(output_rows), f"Processed {idx+1}/{len(df)} rows..."

    # BUG FIX: the final results were `return`ed, which `for` consumers
    # never receive — yield them as a last item instead.
    output_df = pd.DataFrame(output_rows)
    yield output_df, f"Processing complete! Processed {len(output_df)} rows."
| |
|
def format_for_display(df):
    """Return a copy of *df* trimmed for on-screen preview.

    Truncates long 'text'/'output' cells to 200 chars, renders 'time' as a
    "1.23s" string, and orders columns as title, text.len, output.len,
    time, then everything else.

    Args:
        df: Results DataFrame, or None.

    Returns:
        A display-ready DataFrame (empty when *df* is None or empty).
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()

    display_df = df.copy()

    def _truncate(value, limit=200):
        # BUG FIX: coerce to str before slicing — the original measured
        # len(str(x)) but sliced x directly, which raises for long
        # non-string cells.
        s = str(value)
        return s[:limit] + '...' if len(s) > limit else value

    if 'text' in display_df.columns:
        display_df['text'] = display_df['text'].apply(_truncate)

    if 'output' in display_df.columns:
        display_df['output'] = display_df['output'].apply(_truncate)

    if 'time' in display_df.columns:
        display_df['time'] = display_df['time'].apply(lambda x: f"{x:.2f}s")

    # Preferred columns first (only those present), then the remainder in
    # their original order.
    display_order = [c for c in ('title', 'text.len', 'output.len', 'time')
                     if c in display_df.columns]
    display_order.extend(c for c in display_df.columns if c not in display_order)

    return display_df[display_order]
| |
|
| | |
| | |
| | |
# UI copy: browser-tab/page title and the markdown blurb shown above the
# controls. `title` is also reused as the page heading inside the Blocks UI.
title = "Mistral-7B Text Summarizer with Title Generation"
description = """
Process CSV files with text content and generate:
1. Titles (using Mistral-7B-Instruct-v0.2)
2. Bulleted notes summaries (using Mistral-7b-Inst-0.2-Bulleted-Notes)

CSV must contain at least a 'text' column. Optionally include 'title' column to use existing titles.
"""
| |
|
# Gradio UI: left column holds input controls, right column shows streaming
# results, a progress line, and a download link for the full CSV.
with gr.Blocks(title=title, css="""
.output-table { max-height: 500px; overflow-y: auto; }
.progress-text { color: #666; font-style: italic; }
""") as demo:

    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=1):
            # ---- Input controls ----
            gr.Markdown("## Input Settings")

            file_input = gr.File(
                label="Upload CSV File",
                file_types=[".csv"],
                # NOTE(review): type="file" is the Gradio 3.x API; Gradio 4
                # renamed it to "filepath" — confirm the installed version.
                type="file"
            )

            use_existing_titles = gr.Checkbox(
                label="Use existing titles from CSV",
                value=True,
                info="If unchecked, will generate titles for all rows"
            )

            generate_missing_titles = gr.Checkbox(
                label="Generate titles for missing rows",
                value=True,
                info="Generate titles only when 'title' column is empty"
            )

            temperature = gr.Slider(
                label="Summary Temperature",
                value=0.5,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Higher values = more creative, lower = more deterministic"
            )

            title_temperature = gr.Slider(
                label="Title Temperature",
                value=0.3,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Temperature for title generation"
            )

            process_btn = gr.Button("Process CSV", variant="primary")

        with gr.Column(scale=2):
            # ---- Results panel ----
            gr.Markdown("## Results")

            progress_text = gr.Textbox(
                label="Progress",
                value="Ready to process...",
                interactive=False
            )

            display_df = gr.Dataframe(
                label="Preview",
                headers=[],
                datatype=["str", "str", "number", "number", "str"],
                row_count=5,
                col_count=(5, "fixed"),
                wrap=True,
                elem_classes=["output-table"]
            )

            download_csv = gr.File(label="Download Full Results")

    # NOTE(review): update_preview is never wired to any event in this file —
    # it appears to be dead code; kept to avoid breaking external references.
    def update_preview(df, message):
        """Update the preview display."""
        display_df = format_for_display(df)
        return display_df, message

    def process_and_update(file_obj, use_titles, gen_missing, temp, title_temp):
        """Run process_csv and stream (preview, progress, download) updates."""
        if file_obj is None:
            yield None, "Please upload a CSV file", None
            # BUG FIX: previously fell through and called
            # process_csv(None, ...), which failed on file_obj.name and
            # silently produced no output after the warning message.
            return

        results_df = None
        for df_chunk, progress_msg in process_csv(file_obj, use_titles, gen_missing, temp, title_temp):
            if df_chunk is not None:
                results_df = df_chunk
            yield format_for_display(df_chunk), progress_msg, None

        if results_df is not None:
            # Persist the full (untruncated) results for download.
            output_path = "processed_output.csv"
            results_df.to_csv(output_path, index=False)
            yield format_for_display(results_df), "Processing complete!", output_path

    process_btn.click(
        fn=process_and_update,
        inputs=[file_input, use_existing_titles, generate_missing_titles, temperature, title_temperature],
        outputs=[display_df, progress_text, download_csv]
    )

    def on_file_upload(file_obj):
        """Show a head() preview and column info when a file is selected."""
        if file_obj is None:
            return pd.DataFrame(), "No file uploaded"

        try:
            df = pd.read_csv(file_obj.name)
            preview_df = format_for_display(df.head(5))
            info = f"Loaded {len(df)} rows. Columns: {', '.join(df.columns.tolist())}"
            return preview_df, info
        except Exception as e:
            return pd.DataFrame(), f"Error loading file: {str(e)}"

    file_input.change(
        fn=on_file_upload,
        inputs=[file_input],
        outputs=[display_df, progress_text]
    )
| |
|
| | |
| | |
| | |
# Entry point: serve the Gradio app on all network interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (LAN-accessible)
        server_port=7860,
        share=False,            # no public gradio.live tunnel
        debug=True              # verbose errors in console and browser
    )