import json
from io import BytesIO

import gradio as gr
import pandas as pd
import pytesseract
import spacy
from PIL import Image
from PyPDF2 import PdfReader
from spacy import displacy
from transformers import AutoTokenizer, pipeline

# Hugging Face pipelines for NER and summarization
ner_model = pipeline('token-classification', model='dslim/bert-large-NER')
ner_model_bio = pipeline('token-classification', model='d4data/biomedical-ner-all')
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")

# Display names shown in the model dropdown, mapped to model identifiers
ner_models = {
    'bert-large-NER': 'dslim/bert-large-NER',
    'bioNER': 'd4data/biomedical-ner-all',
    'SpaCy English NER': 'en_core_web_trf',
}
spacy_ner_model = spacy.load(ner_models['SpaCy English NER'])

# Tokenizer used when building the highlighted HTML for the transformer models
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
def extract_text_from_pdf(pdf_bytes):
    """
    Extracts text from a PDF file using PyPDF2.
    Parameters:
    - pdf_bytes (bytes): The content of the PDF file in bytes.
    Returns:
    - text (str): Extracted text from the PDF.
    """
    text = ''
    pdf_file = BytesIO(pdf_bytes)
    pdf_reader = PdfReader(pdf_file)
    for page in pdf_reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += page.extract_text() or ''
    return text
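# extract_text_from_pdf usage (illustrative sketch; "report.pdf" is a hypothetical local file):
#     with open("report.pdf", "rb") as f:
#         raw_text = extract_text_from_pdf(f.read())
#     print(raw_text[:200])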
def extract_text_from_image_or_pdf(file_bytes):
    """
    Extracts text from either a PDF or an image file using PyPDF2 and pytesseract.
    Parameters:
    - file_bytes (bytes): The content of the file in bytes.
    Returns:
    - text (str): Extracted text from the file.
    """
    try:
        if file_bytes.startswith(b'%PDF'):
            # PDF file
            text = extract_text_from_pdf(file_bytes)
        else:
            # Assume image file and run OCR
            image = Image.open(BytesIO(file_bytes))
            text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        return f"Error extracting text: {str(e)}"
def perform_ner(text, model_name):
    """
    Performs Named Entity Recognition (NER) on the given text using the specified NER model.
    Parameters:
    - text (str): The input text on which NER will be performed.
    - model_name (str): The name of the NER model to be used ('bert-large-NER', 'bioNER', or 'SpaCy English NER').
    Returns:
    - extracted_entities (list): A list of dictionaries containing information about the recognized entities.
      Each dictionary has the keys: 'text', 'type', 'start_index', 'end_index'.
    - error_message (str): If an error occurs during the NER process, an error message is returned.
    """
    try:
        if model_name == 'SpaCy English NER':
            doc = spacy_ner_model(text)
            extracted_entities = [{'text': ent.text, 'type': ent.label_,
                                   'start_index': ent.start_char, 'end_index': ent.end_char} for ent in doc.ents]
        elif model_name == 'bert-large-NER':
            entities = ner_model(text)
            extracted_entities = [{'text': entity['word'], 'type': entity['entity'],
                                   'start_index': entity['start'], 'end_index': entity['end']} for entity in entities]
        else:
            entities = ner_model_bio(text)
            extracted_entities = [{'text': entity['word'], 'type': entity['entity'],
                                   'start_index': entity['start'], 'end_index': entity['end']} for entity in entities]
        return extracted_entities
    except Exception as e:
        return f"Error performing NER: {str(e)}"
def highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer):
    """
    Takes a raw text input, a list of entities with their start and end indices, a color mapping
    for entity labels, and a tokenizer. It tokenizes each entity's text, highlights the entities
    with the specified colors and labels, and returns the formatted text with HTML-style markup.
    Parameters:
    - `text` (str): The raw input text.
    - `entities` (list): A list of dictionaries, each containing the start index (`start_index`), end index (`end_index`), and type (`type`) of an entity.
    - `color_mapping` (dict): A dictionary mapping entity labels to background colors for highlighting.
    - `tokenizer` (transformers.AutoTokenizer): The tokenizer for encoding the entity text.
    Returns:
    - `highlighted_text` (str): The formatted text with highlighted entities using HTML-style markup.
    """
    highlighted_text = ""
    current_pos = 0
    for ent in entities:
        start, end, label = ent.get('start_index', 0), ent.get('end_index', 0), ent.get('type', 'O')
        entity_text = text[start:end]
        # Tokenize the entity text (informational only; the token count is not used in the markup)
        encoded_entity = tokenizer.encode(entity_text, add_special_tokens=False)
        tokenized_entity_text = tokenizer.convert_ids_to_tokens(encoded_entity)
        tokenized_entity_length = len(tokenized_entity_text)
        # Add non-entity text
        highlighted_text += text[current_pos:start]
        # Add highlighted entity text with color and label on the same line
        color = color_mapping.get(label, '#4D94FF')
        highlighted_text += f"<mark style='background-color:{color}' title='{label}'>{entity_text} ({label})</mark>"
        # Update current position
        current_pos = end
    # Add any remaining non-entity text
    highlighted_text += text[current_pos:]
    return highlighted_text
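# Example of the markup this produces (illustrative; the color comes from the caller's
# color_mapping, here assuming 'B-PER' maps to green):
#     "<mark style='background-color:green' title='B-PER'>Marie (B-PER)</mark> was born in ..."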
def highlight_entities(text, entities, model_name):
    """
    Highlights named entities in the given text and returns HTML with colored annotations.
    Parameters:
    - text (str): The input text containing named entities.
    - entities (list): A list of dictionaries containing information about the recognized entities.
      Each dictionary has the keys: 'text', 'type', 'start_index', 'end_index'.
    - model_name (str): The name of the NER model used ('bert-large-NER', 'bioNER', or 'SpaCy English NER').
    Returns:
    - colored_text (str): HTML with colored annotations highlighting the recognized entities.
    - error_message (str): If an error occurs during the highlighting process, an error message is returned.
    """
    try:
        if model_name == 'SpaCy English NER':
            doc = spacy_ner_model(text)
            color_mapping = {
                "DATE": "#4D94FF",        # Blue
                "PERSON": "#4CAF50",      # Green
                "EVENT": "#FF6666",       # Salmon
                "FAC": "#66B2FF",         # Sky Blue
                "GPE": "#FFCC99",         # Light Apricot
                "LANGUAGE": "#FF80BF",    # Pink
                "LAW": "#66FF99",         # Mint
                "LOC": "#809FFF",         # Lavender Blue
                "MONEY": "#FFFF99",       # Light Yellow
                "NORP": "#808000",        # Olive Green
                "ORDINAL": "#FF9999",     # Misty Rose
                "ORG": "#FFB366",         # Light Peach
                "PERCENT": "#FF99FF",     # Orchid
                "PRODUCT": "#FF6666",     # Salmon
                "QUANTITY": "#CC99FF",    # Pastel Purple
                "TIME": "#FFD54F",        # Amber
                "WORK_OF_ART": "#FFC266", # Light Orange
                "CARDINAL": "#008080",    # Teal
            }
            options = {"ents": [entity['type'] for entity in entities], "colors": color_mapping}
            html = displacy.render(doc, style="ent", options=options, page=True)
            colored_text = html
            return colored_text
        else:
            # BIO tag colors for the transformer models (CoNLL-style labels)
            color_mapping = {
                'O': 'pink',
                'B-MISC': 'red',
                'I-MISC': 'brown',
                'B-PER': 'green',
                'I-PER': '#FFD54F',
                'B-ORG': 'orange',
                'I-ORG': '#FF6666',
                'B-LOC': 'purple',
                'I-LOC': '#FFCC99',
            }
            highlighted_example = highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer)
            return highlighted_example
    except Exception as e:
        return f"Error highlighting entities: {str(e)}"
def summarize_text(input_text):
    """
    Provides a concise summary of the given input text using the Hugging Face Transformers
    summarization pipeline.
    Parameters:
    - input_text (str): The input text that needs to be summarized.
    Returns:
    - summarized_text (str): The summary generated by the pipeline, controlled by the
      `max_length`, `min_length`, `length_penalty`, `num_beams`, and `early_stopping` parameters.
    """
    summarized_text = summarization_pipeline(input_text, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summarized_text = summarized_text[0]['summary_text']
    return summarized_text
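# summarize_text usage (illustrative sketch; the exact wording of the summary depends on the
# BART model, and the input is assumed to be a few sentences or longer):
#     summary = summarize_text(raw_text)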
def image_ner_tool(file, model_name):
    """
    Perform Named Entity Recognition (NER) on the text extracted from an image or PDF file.
    The extracted text is highlighted with colored annotations based on recognized entities.
    Parameters:
    - file (str or BytesIO): Either a file path or a BytesIO object containing the image or PDF file.
    - model_name (str): The name of the NER model to be used ('bert-large-NER', 'bioNER', or 'SpaCy English NER').
    Returns:
    - text (str): Extracted text from the input file.
    - highlighted_text (str): HTML with colored annotations highlighting the recognized entities.
    - reformatted_ner_output (str): JSON-formatted string containing information about the recognized entities.
    - summary (str): Summary of the extracted text.
    """
    reformatted_ner_output = ""
    try:
        if isinstance(file, str):  # If the input is a file path
            with open(file, 'rb') as file_stream:
                file_bytes = file_stream.read()
        else:  # If the input is a byte stream
            file_bytes = file.getvalue()
        text = extract_text_from_image_or_pdf(file_bytes)
        entities = perform_ner(text, model_name)
        highlighted_text = highlight_entities(text, entities, model_name)
        reformatted_ner_output = json.dumps(entities, indent=2)
        summary = summarize_text(text)
        return text, highlighted_text, reformatted_ner_output, summary
    except Exception as e:
        error_message = f"Error processing file: {str(e)}"
        # Return four values so the error maps cleanly onto the four Gradio outputs
        return error_message, "", reformatted_ner_output, ""
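# image_ner_tool usage outside Gradio (illustrative sketch; "invoice.pdf" is a hypothetical file):
#     text, html, entities_json, summary = image_ner_tool("invoice.pdf", "bert-large-NER")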
def store_data_to_csv(file, model_name):
    """
    Re-extracts text and entities for the uploaded file and appends them as one row to log.csv.
    Parameters:
    - file (str or BytesIO): Either a file path or a BytesIO object containing the image or PDF file.
    - model_name (str): The name of the NER model to be used.
    """
    if isinstance(file, str):  # If the input is a file path
        with open(file, 'rb') as file_stream:
            file_bytes = file_stream.read()
    else:  # If the input is a byte stream
        file_bytes = file.getvalue()
    extracted_text = extract_text_from_image_or_pdf(file_bytes)
    named_entities = perform_ner(extracted_text, model_name)
    df = pd.DataFrame({"Extracted Text": [extracted_text], "Extracted Entities": [named_entities]})
    df.to_csv("log.csv", mode='a', index=False, header=False)
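# Each call appends a single header-less row to log.csv, roughly of the form (illustrative):
#     "full extracted text ...","[{'text': 'Marie Curie', 'type': 'PERSON', ...}]"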
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <p style="text-align: center; font-weight: bold; font-size: 44px;">
            Intelligent Document Processing
        </p>
        <p style="text-align: center;">
            Upload a PDF or an image file to extract text and identify named entities
        </p>
        """
    )
    with gr.Row() as row:
        with gr.Column():
            text1 = gr.File(label="Upload File")
            model = gr.Dropdown(list(ner_models.keys()), label="Select NER Model")
            btn = gr.Button("Submit")
        with gr.Column():
            with gr.Tab("Extracted Text"):
                output1 = gr.Textbox(label="Extracted Text", container=True)
            with gr.Tab("Highlighted Entities"):
                output2 = gr.HTML(label="Highlighted Text")
            with gr.Tab("Summarized Text"):
                output3 = gr.HTML(label="Summarized Text")
            with gr.Tab("Named Entities Extracted"):
                output4 = gr.HTML(label="Named Entities")
            store_button = gr.Button("Store Data to CSV")
    gr.Examples(
        [
            [
                "The year is 2043.pdf",  # Path to the example file
                "SpaCy English NER"      # Selected value for the model dropdown
            ]
        ],
        [text1, model],
    )
    # image_ner_tool returns (text, highlighted HTML, NER JSON, summary)
    btn.click(
        image_ner_tool,
        [text1, model],
        [output1, output2, output4, output3],
    )
    store_button.click(
        store_data_to_csv,
        [text1, model],
    )
demo.launch()