from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re
import gradio as gr
import os
import docx2txt

# Load the base LED tokenizer and a locally fine-tuned checkpoint; run on CPU
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
model = AutoModelForSeq2SeqLM.from_pretrained("checkpoint-64840").to("cpu")


def summarize(text_file):
    file_extension = os.path.splitext(text_file.name)[1]
    if file_extension == ".txt":
        # Load text from a plain-text file
        with open(text_file.name, "r", encoding="utf-8") as f:
            text = f.read()
    elif file_extension == ".docx":
        # Load text from a Word file
        text = docx2txt.process(text_file.name)
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")

    # Tokenize and truncate to the model's 16k-token window
    input_ids = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=16384
    ).input_ids.to("cpu")

    # LED needs global attention on at least one token; set it on the first (<s>) token
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1

    # generate() returns a tensor of token ids unless return_dict_in_generate=True,
    # so decode it directly rather than accessing a .sequences attribute
    sequences = model.generate(input_ids, global_attention_mask=global_attention_mask)
    summary = tokenizer.batch_decode(sequences, skip_special_tokens=True)[0]
    return text, summary


iface = gr.Interface(
    fn=summarize,
    # gr.inputs/gr.outputs are deprecated; use the top-level components instead
    inputs=gr.File(label="Upload a txt file or a Word file for the input text"),
    outputs=[gr.Textbox(label="Original text"), gr.Textbox(label="Summary")],
| title="Academic Paper Summarization Demo", | |
| description="Upload a txt file or a Word file for the input text. Get a summary generated by a small T5 model from Hugging Face.", | |
| ) | |
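
# Optional local smoke test (a minimal sketch, not part of the original demo):
# it assumes a file named "sample.txt" sits next to this script, and uses
# SimpleNamespace to mimic the ".name" attribute of the file object that Gradio
# passes to summarize(). The SMOKE_TEST environment variable is a hypothetical
# opt-in switch so the check does not run by default.
if os.environ.get("SMOKE_TEST"):
    from types import SimpleNamespace

    original_text, short_summary = summarize(SimpleNamespace(name="sample.txt"))
    print(short_summary)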

iface.launch()