Spaces:

mic3333
/

Text_summarization

Sleeping

App Files Files Community

mic3333 committed on Sep 7

Commit

86d82de

1 Parent(s): 21b70d2

upload

Browse files

Files changed (4) hide show

.gradio/certificate.pem +31 -0
CLAUDE.md +33 -0
app.py +301 -0
requirements.txt +10 -0

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

CLAUDE.md ADDED Viewed

	@@ -0,0 +1,33 @@

+# CLAUDE.md
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+## Repository Overview
+This is a Hugging Face Spaces repository configured for a text summarization project. The repository currently contains minimal setup with just configuration files.
+### Current Structure
+- `README.md`: Hugging Face Spaces configuration with Docker SDK setup
+- Repository is configured as a Hugging Face Space with:
+  - Docker SDK
+  - Pink to purple gradient theme
+  - MIT license
+### Development Setup
+This appears to be an early-stage Hugging Face Spaces project. Based on the configuration:
+- Uses Docker for deployment
+- Intended for text summarization functionality
+- Currently lacks implementation files
+### Next Steps for Development
+When developing this project, you'll likely need to:
+- Add a Python requirements file (`requirements.txt`) that includes the Gradio library
+- Create main application file (typically `app.py` for Hugging Face Spaces)
+- Add a Gradio dashboard template for the text summarization function
+- Configure appropriate Docker setup if not using default
+### Hugging Face Spaces Reference
+Configuration follows Hugging Face Spaces format. See: https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,301 @@

+import gradio as gr
+import whisper
+import PyPDF2
+import docx
+from transformers import pipeline
+import io
+import tempfile
+import os
+import numpy as np
+class TextSummarizer:
+    """Extract text from documents or audio and summarize it.
+    Wraps a Hugging Face summarization pipeline (``facebook/bart-large-cnn``)
+    and a Whisper ``base`` speech-to-text model. The public methods are meant
+    to be used as UI callbacks, so they report failures by returning a
+    message string instead of raising.
+    """
+    # Upper bound passed to the pipeline as ``max_length`` for each UI choice.
+    SUMMARY_MAX_TOKENS = {"Short": 100, "Medium": 150, "Long": 250}
+    # Sample rate (Hz) that raw waveforms are resampled to before Whisper.
+    WHISPER_SAMPLE_RATE = 16000
+    # Extension -> (name of the raising reader method, error-message prefix).
+    _READERS = {
+        ".txt": ("_read_txt", "Error reading TXT file"),
+        ".pdf": ("_read_pdf", "Error reading PDF"),
+        ".docx": ("_read_docx", "Error reading DOCX"),
+        ".mp3": ("_transcribe_file", "Error transcribing audio"),
+        ".wav": ("_transcribe_file", "Error transcribing audio"),
+        ".m4a": ("_transcribe_file", "Error transcribing audio"),
+        ".flac": ("_transcribe_file", "Error transcribing audio"),
+    }
+    def __init__(self):
+        """Load the summarization pipeline and the Whisper model."""
+        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+        self.whisper_model = whisper.load_model("base")
+    # ---- private helpers: these raise on failure -------------------------
+    @staticmethod
+    def _path_of(file):
+        """Return the filesystem path for a path string or an object with ``.name``."""
+        # Check for path-like values first: ``pathlib.Path.name`` is only the
+        # basename, so blindly reading ``.name`` would lose the directory.
+        if isinstance(file, (str, os.PathLike)):
+            return os.fspath(file)
+        return file.name
+    @staticmethod
+    def _attempt(reader, source, error_prefix):
+        """Call ``reader(source)`` and return ``(text, error)``; exactly one is ``None``."""
+        try:
+            return reader(source), None
+        except Exception as e:
+            return None, f"{error_prefix}: {str(e)}"
+    def _read_pdf(self, pdf_file):
+        """Concatenate the extracted text of every page of a PDF."""
+        reader = PyPDF2.PdfReader(pdf_file)
+        # extract_text() may yield nothing for a page; treat that as "".
+        return "".join(page.extract_text() or "" for page in reader.pages)
+    def _read_docx(self, docx_file):
+        """Join all paragraphs of a DOCX document, one per line."""
+        document = docx.Document(docx_file)
+        return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
+    def _read_txt(self, txt_file):
+        """Read a UTF-8 text file given its path or an object with ``.name``."""
+        with open(self._path_of(txt_file), "r", encoding="utf-8") as handle:
+            return handle.read()
+    def _transcribe_file(self, audio_file):
+        """Transcribe an audio file path with Whisper and return the text."""
+        return self.whisper_model.transcribe(audio_file)["text"]
+    def _load_text(self, file, unsupported_label):
+        """Pick a reader by file extension and return ``(text, error)``.
+        Keeping the error separate from the text means a document whose
+        content happens to start with "Error" is not mistaken for a failure.
+        """
+        path = self._path_of(file)
+        extension = os.path.splitext(path)[1].lower()
+        entry = self._READERS.get(extension)
+        if entry is None:
+            return None, f"{unsupported_label}: {extension}"
+        reader_name, error_prefix = entry
+        return self._attempt(getattr(self, reader_name), path, error_prefix)
+    def _prepare_waveform(self, sample_rate, data):
+        """Convert a raw audio chunk to mono float32 at ``WHISPER_SAMPLE_RATE``."""
+        samples = np.asarray(data)
+        if np.issubdtype(samples.dtype, np.integer):
+            # Scale signed PCM into [-1, 1); for int16 the divisor is 32768.
+            scale = float(np.iinfo(samples.dtype).max) + 1.0
+            samples = samples.astype(np.float32) / scale
+        else:
+            samples = samples.astype(np.float32)
+        if samples.ndim > 1:
+            # NOTE(review): assumes a (samples, channels) layout -- confirm
+            # against the audio component that feeds this method.
+            samples = samples.mean(axis=1)
+        if samples.size and sample_rate != self.WHISPER_SAMPLE_RATE:
+            # Plain linear interpolation (no anti-alias filter); dependency-free.
+            target_len = int(round(samples.size * self.WHISPER_SAMPLE_RATE / sample_rate))
+            positions = np.linspace(0, samples.size - 1, num=target_len)
+            samples = np.interp(positions, np.arange(samples.size), samples)
+        return samples.astype(np.float32)
+    # ---- public API: these return a message string on failure ------------
+    def extract_text_from_pdf(self, pdf_file):
+        """Extract text from a PDF path or file object; return an "Error reading PDF: ..." string on failure."""
+        text, error = self._attempt(self._read_pdf, pdf_file, "Error reading PDF")
+        return text if error is None else error
+    def extract_text_from_docx(self, docx_file):
+        """Extract text from a DOCX path or file object; return an "Error reading DOCX: ..." string on failure."""
+        text, error = self._attempt(self._read_docx, docx_file, "Error reading DOCX")
+        return text if error is None else error
+    def process_text_file(self, txt_file):
+        """Read a UTF-8 TXT file; return an "Error reading TXT file: ..." string on failure."""
+        text, error = self._attempt(self._read_txt, txt_file, "Error reading TXT file")
+        return text if error is None else error
+    def transcribe_audio(self, audio_file):
+        """Transcribe an audio file with Whisper; return an "Error transcribing audio: ..." string on failure."""
+        text, error = self._attempt(self._transcribe_file, audio_file, "Error transcribing audio")
+        return text if error is None else error
+    def summarize_text(self, text, max_length=150, min_length=50):
+        """Summarize ``text`` with the summarization pipeline.
+        Args:
+            text: Text to summarize; fewer than 50 non-blank characters
+                (or ``None``) is rejected with a message.
+            max_length: Passed to the pipeline as ``max_length``.
+            min_length: Passed to the pipeline as ``min_length``.
+        Returns:
+            The summary, or a message string when the text is too short or
+            the pipeline raises.
+        """
+        try:
+            if not text or len(text.strip()) < 50:
+                return "Text is too short to summarize."
+            # truncation=True cuts over-long input down to the model's input
+            # limit instead of letting the model fail on it; only the kept
+            # leading part is summarized.
+            summary = self.summarizer(
+                text,
+                max_length=max_length,
+                min_length=min_length,
+                do_sample=False,
+                truncation=True,
+            )
+            return summary[0]['summary_text']
+        except Exception as e:
+            return f"Error summarizing text: {str(e)}"
+    def process_file(self, file, summary_length):
+        """Extract text from an uploaded file and return a formatted summary.
+        Args:
+            file: Path string or object with ``.name``; ``None`` means no upload.
+            summary_length: "Short", "Medium" or "Long"; any other value
+                falls back to "Medium".
+        Returns:
+            Markdown with the original length and the summary, or a message
+            string describing why the file could not be processed.
+        """
+        if file is None:
+            return "No file uploaded."
+        max_length = self.SUMMARY_MAX_TOKENS.get(
+            summary_length, self.SUMMARY_MAX_TOKENS["Medium"]
+        )
+        min_length = max_length // 3
+        text, error = self._load_text(file, "Unsupported file format")
+        if error is not None:
+            return error
+        summary = self.summarize_text(text, max_length, min_length)
+        return f"**Original Text Length:** {len(text)} characters\n\n**Summary:**\n{summary}"
+    def transcribe_stream(self, audio_chunk, current_transcript):
+        """Transcribe one streamed audio chunk and append it to the transcript.
+        Args:
+            audio_chunk: ``(sample_rate, samples)`` tuple, or ``None``.
+            current_transcript: Transcript accumulated so far.
+        Returns:
+            ``(display_text, transcript)``. On failure ``display_text`` is an
+            error message and ``transcript`` is returned unchanged.
+        """
+        current_transcript = current_transcript or ""
+        if audio_chunk is None:
+            return current_transcript, current_transcript
+        try:
+            sample_rate, data = audio_chunk
+            # Whisper takes 16 kHz mono float32 waveforms; microphone chunks
+            # may arrive at another rate and with several channels.
+            samples = self._prepare_waveform(sample_rate, data)
+            if samples.size == 0:
+                return current_transcript, current_transcript
+            result = self.whisper_model.transcribe(samples, fp16=False)
+            updated_transcript = current_transcript + result['text'] + " "
+            return updated_transcript, updated_transcript
+        except Exception as e:
+            return f"Error during transcription: {str(e)}", current_transcript
+    def convert_file_to_text(self, file):
+        """Extract text from any supported file format.
+        Returns the extracted text, or a message string when no file was
+        given, the extension is unsupported, or reading fails.
+        """
+        if file is None:
+            return "No file uploaded for conversion."
+        text, error = self._load_text(file, "Unsupported file format for conversion")
+        return text if error is None else error
+def create_interface():
+    """Build the Gradio Blocks dashboard and return it (not yet launched).
+    The dashboard has three tabs: file upload and conversion to text,
+    summarization of the uploaded file, and live microphone transcription
+    with summarization. One ``TextSummarizer`` instance backs all callbacks.
+    """
+    summarizer = TextSummarizer()
+    with gr.Blocks(title="Text Summarization Dashboard") as interface:
+        gr.Markdown("Text Summarization Dashboard")
+        gr.Markdown("Manage files, and interact with specialized AI agents for various tasks.")
+        # State component to store the uploaded file so other tabs can use it
+        uploaded_file_state = gr.State(None)
+        with gr.Tabs():
+            with gr.TabItem("📄 File Management & Conversion"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.Markdown("### Upload File")
+                        file_input = gr.File(
+                            label="Select a file",
+                            file_types=[".txt", ".pdf", ".docx", ".mp3", ".wav", ".m4a", ".flac"]
+                        )
+                        uploaded_file_name = gr.Textbox(label="Current File", interactive=False)
+                        def store_file(file):
+                            """Remember the uploaded file and show its name."""
+                            if file:
+                                return file, getattr(file, "name", file)
+                            return None, "No file uploaded"
+                        def forget_file():
+                            """Drop the remembered file when the upload widget is cleared."""
+                            return None, "No file uploaded"
+                        file_input.upload(
+                            fn=store_file,
+                            inputs=[file_input],
+                            outputs=[uploaded_file_state, uploaded_file_name]
+                        )
+                        # Without this, clearing the widget would leave the
+                        # previous file in the state and it would still be
+                        # converted or summarized.
+                        file_input.clear(
+                            fn=forget_file,
+                            inputs=None,
+                            outputs=[uploaded_file_state, uploaded_file_name]
+                        )
+                    with gr.Column(scale=1):
+                        gr.Markdown("### Convert to TXT")
+                        gr.Markdown("Supported formats for conversion to .txt: `.pdf`, `.docx`, `.mp3`, `.wav`, `.m4a`, `.flac`")
+                        convert_btn = gr.Button("Convert to TXT", variant="secondary")
+                        conversion_output = gr.Textbox(
+                            label="Conversion Output",
+                            placeholder="Converted text will appear here...",
+                            lines=8,
+                            interactive=False
+                        )
+                        convert_btn.click(
+                            fn=summarizer.convert_file_to_text,
+                            inputs=[uploaded_file_state],
+                            outputs=[conversion_output]
+                        )
+            with gr.TabItem("✍️ Meeting Summarization"):
+                gr.Markdown("### Meeting Summarization")
+                gr.Markdown("Generate summaries from your meeting transcripts and other documents.")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        # The help text mirrors the max_length values the
+                        # summarizer uses for each choice (100 / 150 / 250).
+                        summary_length = gr.Dropdown(
+                            choices=["Short", "Medium", "Long"],
+                            value="Medium",
+                            label="Summary Length",
+                            info="Short: up to ~100 tokens, Medium: up to ~150 tokens, Long: up to ~250 tokens"
+                        )
+                        submit_btn = gr.Button("Generate Summary", variant="primary")
+                    with gr.Column(scale=2):
+                        output = gr.Textbox(
+                            label="Summary Output",
+                            lines=10,
+                            placeholder="Your summary will appear here..."
+                        )
+                # NOTE: the controls in this accordion are display-only; none
+                # of them is passed as an input to any callback below.
+                with gr.Accordion("⚙️ Model Settings", open=False):
+                    gr.Markdown("### Model Selection & Fine-Tuning")
+                    gr.Markdown("Choose different models and configure their parameters.")
+                    with gr.Row():
+                        gr.Dropdown(
+                            label="Select Summarization Model",
+                            choices=["facebook/bart-large-cnn", "t5-small", "google/pegasus-xsum"],
+                            value="facebook/bart-large-cnn"
+                        )
+                    with gr.Accordion("Fine-Tuning Options", open=False):
+                        gr.Slider(label="Min Tokens", minimum=10, maximum=200, step=5, value=50)
+                        gr.Slider(label="Max Tokens", minimum=50, maximum=500, step=10, value=150)
+                        gr.Slider(label="Temperature", minimum=0.1, maximum=1.5, step=0.1, value=0.7)
+                        gr.Slider(label="Top-K", minimum=0, maximum=100, step=1, value=50, info="0 to disable")
+                        gr.Slider(label="Top-P (Nucleus Sampling)", minimum=0.0, maximum=1.0, step=0.05, value=0.95, info="0 to disable")
+                        gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.1, value=1.2)
+                        gr.Slider(label="Number of Beams", minimum=1, maximum=8, step=1, value=4)
+            with gr.TabItem("🔴 Live Meeting Recording & Summarization"):
+                gr.Markdown("### Live Meeting Transcription & Summarization")
+                gr.Markdown("Record audio from your microphone, get a live transcript, and generate a summary.")
+                # Transcript accumulated across streamed audio chunks
+                live_transcript_state = gr.State("")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        audio_input = gr.Audio(
+                            label="Live Audio",
+                            sources="microphone",
+                            streaming=True,
+                        )
+                    with gr.Column(scale=2):
+                        live_transcript_output = gr.Textbox(
+                            label="Live Transcript",
+                            placeholder="Transcript will appear here...",
+                            lines=15,
+                        )
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        live_summary_length = gr.Dropdown(
+                            choices=["Short", "Medium", "Long"],
+                            value="Medium",
+                            label="Summary Length"
+                        )
+                        live_summary_btn = gr.Button("Generate Summary", variant="primary")
+                    with gr.Column(scale=2):
+                        live_summary_output = gr.Textbox(
+                            label="Meeting Summary",
+                            placeholder="Summary will appear here...",
+                            lines=5,
+                        )
+                # Each streamed chunk is transcribed and appended; the callback
+                # returns the textbox value first and the new state second.
+                audio_input.stream(
+                    fn=summarizer.transcribe_stream,
+                    inputs=[audio_input, live_transcript_state],
+                    outputs=[live_transcript_output, live_transcript_state],
+                )
+                def generate_live_summary(transcript, length_option):
+                    """Summarize the transcript currently shown in the textbox."""
+                    max_len = {"Short": 100, "Medium": 150, "Long": 250}[length_option]
+                    min_len = max_len // 3
+                    return summarizer.summarize_text(transcript, max_length=max_len, min_length=min_len)
+                live_summary_btn.click(
+                    fn=generate_live_summary,
+                    inputs=[live_transcript_output, live_summary_length],
+                    outputs=[live_summary_output],
+                )
+        # Wired here, after the tabs, because it reads the file stored by the
+        # first tab and writes to the output box of the second.
+        submit_btn.click(
+            fn=summarizer.process_file,
+            inputs=[uploaded_file_state, summary_length],
+            outputs=output
+        )
+    return interface
+if __name__ == "__main__":
+    # Build the dashboard and start the Gradio server only when this file is
+    # run as a script; share=True additionally requests a public share link.
+    demo = create_interface()
+    demo.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+gradio==4.44.1
+transformers==4.35.2
+torch==2.1.1
+openai-whisper==20231117
+PyPDF2==3.0.1
+python-docx==1.1.0
+datasets==2.14.6
+accelerate==0.24.1
+sentencepiece==0.1.99
+protobuf==4.25.0