fidocure_note_extractor_2

Sleeping

App Files Community

Kevin Wu committed on Oct 9, 2024

Commit

95174f7

1 Parent(s): 854997c

Initial

Browse files

Files changed (2) hide show

app.py +179 -156
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -4,184 +4,203 @@ import os
 import time
 import gradio as gr
 from openai import OpenAI
 import xml.etree.ElementTree as ET
 import re
 import pandas as pd
 import prompts
 client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 model_name = "gpt-4o-2024-08-06"
-demo = client.beta.assistants.create(
-    name="Information Extractor",
-    instructions="Extract information from this note.",
-    model=model_name,
-    tools=[{"type": "file_search"}],
-)
 def parse_xml_response(xml_string: str) -> pd.DataFrame:
     """
     Parse the XML response from the model and extract all fields into a dictionary,
     then convert it to a pandas DataFrame with a nested index.
     """
-    # Extract only the XML content between the first and last tags
-    xml_content = re.search(r'<.*?>.*</.*?>', xml_string, re.DOTALL)
-    if xml_content:
-        xml_string = xml_content.group(0)
-    else:
-        print("No valid XML content found.")
-        return pd.DataFrame()
     try:
         root = ET.fromstring(xml_string)
     except ET.ParseError as e:
-        print(f"Error parsing XML: {e}")
         return pd.DataFrame()
-    result = {}
-    for element in root:
-        tag = element.tag
-        if tag in ['patient_name', 'date_of_birth', 'sex', 'weight', 'date_of_death']:
-            result[tag] = {
-                'reasoning': element.find('reasoning').text.strip() if element.find('reasoning') is not None else None,
-                **{child.tag: child.text.strip() if child.text else None
-                   for child in element if child.tag != 'reasoning'}
-            }
-        elif tag in ['traditional_chemo', 'other_cancer_treatments', 'other_conmeds']:
-            if tag not in result:
-                result[tag] = []
-            reasoning = element.find('reasoning')
-            for item in element:
-                if item.tag in ['drug', 'treatment', 'medication']:
-                    date_element = element.find('date')
-                    result[tag].append({
-                        'reasoning': reasoning.text.strip() if reasoning is not None else None,
-                        'name': item.text.strip() if item.text else None,
-                        'date': date_element.text.strip() if date_element is not None and date_element.text else None
-                    })
-        elif tag in ['surgery', 'surgery_outcome', 'metastasis_at_time_of_diagnosis']:
-            result[tag] = {
-                'reasoning': element.find('reasoning').text.strip() if element.find('reasoning') is not None else None,
-                **{child.tag: child.text.strip() if child.text else None
-                   for child in element if child.tag != 'reasoning'}
-            }
-        elif tag == 'compounding_pharmacy':
-            result[tag] = {
-                'reasoning': element.find('reasoning').text.strip() if element.find('reasoning') is not None else None,
-                'pharmacy': element.find('pharmacy').text.strip() if element.find('pharmacy') is not None else None
-            }
-        elif tag == 'adverse_effects':
-            if tag not in result:
-                result[tag] = []
-            effect = {
-                'reasoning': element.find('reasoning').text.strip() if element.find('reasoning') is not None else None
-            }
-            for child in element:
-                if child.tag != 'reasoning':
-                    effect[child.tag] = child.text.strip() if child.text else None
-            if effect:
-                result[tag].append(effect)
-    # Convert to nested DataFrame
-    df_data = {}
-    for key, value in result.items():
-        if isinstance(value, dict):
-            for sub_key, sub_value in value.items():
-                df_data[(key, '1', sub_key)] = [sub_value]
-        elif isinstance(value, list):
-            for i, item in enumerate(value):
-                for sub_key, sub_value in item.items():
-                    df_data[(key, f"{i+1}", sub_key)] = [sub_value]
-        else:
-            df_data[(key, '1', '')] = [value]
-    # Create multi-index DataFrame
-    df = pd.DataFrame(df_data)
-    df.columns = pd.MultiIndex.from_tuples(df.columns)
-    return df
 def get_response(prompt, file_id, assistant_id):
-    thread = client.beta.threads.create(
-        messages=[
-            {
-                "role": "user",
-                "content": prompts.info_prompt,
-                "attachments": [
-                    {"file_id": file_id, "tools": [{"type": "file_search"}]}
-                ],
-            }
-        ]
-    )
-    run = client.beta.threads.runs.create_and_poll(
-        thread_id=thread.id, assistant_id=assistant_id
-    )
-    messages = list(
-        client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id)
-    )
-    message_content = messages[0].content[0].text
-    annotations = message_content.annotations
-    for index, annotation in enumerate(annotations):
-        message_content.value = message_content.value.replace(annotation.text, f"")
-    return message_content.value
 def process(file_content):
-    if not os.path.exists("cache"):
-        os.makedirs("cache")
-    file_name = f"cache/{time.time()}.pdf"
-    with open(file_name, "wb") as f:
-        f.write(file_content)
-    message_file = client.files.create(file=open(file_name, "rb"), purpose="assistants")
-    response = get_response(prompts.info_prompt, message_file.id, demo.id)
-    df = parse_xml_response(response)
-    if df.empty:
-        return "<p>No valid information could be extracted from the provided file.</p>"
-    # Transpose the DataFrame
-    df_transposed = df.T.reset_index()
-    df_transposed.columns = ['Category', 'Index', 'Field', 'Value']
-    df_transposed = df_transposed.sort_values(['Category', 'Index', 'Field'])
-    # Convert to HTML with some basic styling
-    html = df_transposed.to_html(index=False, classes='table table-striped table-bordered', escape=False)
-    # Add some custom CSS for better readability
-    html = f"""
-    <style>
-    .table {{
-        width: 100%;
-        max-width: 100%;
-        margin-bottom: 1rem;
-        background-color: transparent;
-    }}
-    .table td, .table th {{
-        padding: .75rem;
-        vertical-align: top;
-        border-top: 1px solid #dee2e6;
-    }}
-    .table thead th {{
-        vertical-align: bottom;
-        border-bottom: 2px solid #dee2e6;
-    }}
-    .table tbody + tbody {{
-        border-top: 2px solid #dee2e6;
-    }}
-    .table-striped tbody tr:nth-of-type(odd) {{
-        background-color: rgba(0,0,0,.05);
-    }}
-    </style>
-    {html}
-    """
-    return html
 def gradio_interface():
     upload_component = gr.File(label="Upload PDF", type="binary")
@@ -198,4 +217,8 @@ def gradio_interface():
     demo.launch()
 if __name__ == "__main__":
-    gradio_interface()

 import time
 import gradio as gr
 from openai import OpenAI
 import xml.etree.ElementTree as ET
 import re
 import pandas as pd
 import prompts
+import traceback
 client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 model_name = "gpt-4o-2024-08-06"
+try:
+    demo = client.beta.assistants.create(
+        name="Information Extractor",
+        instructions="Extract information from this note.",
+        model=model_name,
+        tools=[{"type": "file_search"}],
+    )
+except Exception as e:
+    print(f"Error creating assistant: {str(e)}")
+    raise
 def parse_xml_response(xml_string: str) -> pd.DataFrame:
     """
     Parse the XML response from the model and extract all fields into a dictionary,
     then convert it to a pandas DataFrame with a nested index.
     """
     try:
+        # Extract only the XML content between the first and last tags
+        xml_content = re.search(r'<.*?>.*</.*?>', xml_string, re.DOTALL)
+        if xml_content:
+            xml_string = xml_content.group(0)
+        else:
+            print("No valid XML content found.")
+            return pd.DataFrame()
         root = ET.fromstring(xml_string)
+        result = {}
+        for element in root:
+            tag = element.tag
+            if tag in ['patient_name', 'date_of_birth', 'sex', 'weight', 'date_of_death']:
+                result[tag] = {
+                    'reasoning': element.find('reasoning').text.strip() if element.find('reasoning') is not None else None,
+                    **{child.tag: child.text.strip() if child.text else None
+                       for child in element if child.tag != 'reasoning'}
+                }
+            elif tag in ['traditional_chemo', 'other_cancer_treatments', 'other_conmeds']:
+                if tag not in result:
+                    result[tag] = []
+                reasoning = element.find('reasoning')
+                for item in element:
+                    if item.tag in ['drug', 'treatment', 'medication']:
+                        date_element = element.find('date')
+                        result[tag].append({
+                            'reasoning': reasoning.text.strip() if reasoning is not None else None,
+                            'name': item.text.strip() if item.text else None,
+                            'date': date_element.text.strip() if date_element is not None and date_element.text else None
+                        })
+            elif tag in ['surgery', 'surgery_outcome', 'metastasis_at_time_of_diagnosis']:
+                result[tag] = {
+                    'reasoning': element.find('reasoning').text.strip() if element.find('reasoning') is not None else None,
+                    **{child.tag: child.text.strip() if child.text else None
+                       for child in element if child.tag != 'reasoning'}
+                }
+            elif tag == 'compounding_pharmacy':
+                result[tag] = {
+                    'reasoning': element.find('reasoning').text.strip() if element.find('reasoning') is not None else None,
+                    'pharmacy': element.find('pharmacy').text.strip() if element.find('pharmacy') is not None else None
+                }
+            elif tag == 'adverse_effects':
+                if tag not in result:
+                    result[tag] = []
+                effect = {
+                    'reasoning': element.find('reasoning').text.strip() if element.find('reasoning') is not None else None
+                }
+                for child in element:
+                    if child.tag != 'reasoning':
+                        effect[child.tag] = child.text.strip() if child.text else None
+                if effect:
+                    result[tag].append(effect)
+        # Convert to nested DataFrame
+        df_data = {}
+        for key, value in result.items():
+            if isinstance(value, dict):
+                for sub_key, sub_value in value.items():
+                    df_data[(key, '1', sub_key)] = [sub_value]
+            elif isinstance(value, list):
+                for i, item in enumerate(value):
+                    for sub_key, sub_value in item.items():
+                        df_data[(key, f"{i+1}", sub_key)] = [sub_value]
+            else:
+                df_data[(key, '1', '')] = [value]
+        # Create multi-index DataFrame
+        df = pd.DataFrame(df_data)
+        df.columns = pd.MultiIndex.from_tuples(df.columns)
+        return df
     except ET.ParseError as e:
+        print(f"XML parsing error: {str(e)}")
+        print(f"Problematic XML content: {xml_string[:500]}...")  # Print first 500 chars of XML
+        return pd.DataFrame()
+    except Exception as e:
+        print(f"Error in parse_xml_response: {str(e)}")
+        print(f"Traceback: {traceback.format_exc()}")
         return pd.DataFrame()
 def get_response(prompt, file_id, assistant_id):
+    try:
+        thread = client.beta.threads.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompts.info_prompt,
+                    "attachments": [
+                        {"file_id": file_id, "tools": [{"type": "file_search"}]}
+                    ],
+                }
+            ]
+        )
+        run = client.beta.threads.runs.create_and_poll(
+            thread_id=thread.id, assistant_id=assistant_id
+        )
+        messages = list(
+            client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id)
+        )
+        assert len(messages) == 1, f"Expected 1 message, got {len(messages)}"
+        message_content = messages[0].content[0].text
+        annotations = message_content.annotations
+        for index, annotation in enumerate(annotations):
+            message_content.value = message_content.value.replace(annotation.text, f"")
+        return message_content.value
+    except Exception as e:
+        print(f"Error in get_response: {str(e)}")
+        print(f"Traceback: {traceback.format_exc()}")
+        raise
 def process(file_content):
+    try:
+        if not os.path.exists("cache"):
+            os.makedirs("cache")
+        file_name = f"cache/{time.time()}.pdf"
+        with open(file_name, "wb") as f:
+            f.write(file_content)
+        message_file = client.files.create(file=open(file_name, "rb"), purpose="assistants")
+        response = get_response(prompts.info_prompt, message_file.id, demo.id)
+        df = parse_xml_response(response)
+        if df.empty:
+            return "<p>No valid information could be extracted from the provided file.</p>"
+        # Transpose the DataFrame
+        df_transposed = df.T.reset_index()
+        df_transposed.columns = ['Category', 'Index', 'Field', 'Value']
+        df_transposed = df_transposed.sort_values(['Category', 'Index', 'Field'])
+        # Convert to HTML with some basic styling
+        html = df_transposed.to_html(index=False, classes='table table-striped table-bordered', escape=False)
+        # Add some custom CSS for better readability
+        html = f"""
+        <style>
+        .table {{
+            width: 100%;
+            max-width: 100%;
+            margin-bottom: 1rem;
+            background-color: transparent;
+        }}
+        .table td, .table th {{
+            padding: .75rem;
+            vertical-align: top;
+            border-top: 1px solid #dee2e6;
+        }}
+        .table thead th {{
+            vertical-align: bottom;
+            border-bottom: 2px solid #dee2e6;
+        }}
+        .table tbody + tbody {{
+            border-top: 2px solid #dee2e6;
+        }}
+        .table-striped tbody tr:nth-of-type(odd) {{
+            background-color: rgba(0,0,0,.05);
+        }}
+        </style>
+        {html}
+        """
+        return html
+    except Exception as e:
+        error_message = f"An error occurred while processing the file: {str(e)}"
+        print(error_message)
+        print(f"Traceback: {traceback.format_exc()}")
+        return f"<p>{error_message}</p>"
 def gradio_interface():
     upload_component = gr.File(label="Upload PDF", type="binary")
     demo.launch()
 if __name__ == "__main__":
+    try:
+        gradio_interface()
+    except Exception as e:
+        print(f"Error launching Gradio interface: {str(e)}")
+        print(f"Traceback: {traceback.format_exc()}")

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-gradio==3.50.2
 openai==1.51.2
 pandas

+gradio==4.29.0
 openai==1.51.2
 pandas