Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	
		Matthew Upson
		
	committed on
		
		
					chg: Extract locations from Wellcome examples
Browse files
- .dvc/.gitignore +3 -0
- .dvc/config +5 -0
- .dvcignore +3 -0
- .gitignore +2 -1
- app.py +32 -24
- data/processed/.gitignore +1 -0
- dvc.lock +20 -4
- dvc.yaml +9 -0
- params.yaml +1 -0
- requirements.txt +2 -9
- src/process_docs.py +61 -0
- src/subset_data.py +30 -14
- unpinned_requirements.txt +2 -1
    	
        .dvc/.gitignore
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            /config.local
         | 
| 2 | 
            +
            /tmp
         | 
| 3 | 
            +
            /cache
         | 
    	
        .dvc/config
    ADDED
    
    | @@ -0,0 +1,5 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            [core]
         | 
| 2 | 
            +
                remote = s3
         | 
| 3 | 
            +
                autostage = true
         | 
| 4 | 
            +
            ['remote "s3"']
         | 
| 5 | 
            +
                url = s3://mantisnlp-blogs/ner
         | 
    	
        .dvcignore
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Add patterns of files dvc should ignore, which could improve
         | 
| 2 | 
            +
            # the performance. Learn more at
         | 
| 3 | 
            +
            # https://dvc.org/doc/user-guide/dvcignore
         | 
    	
        .gitignore
    CHANGED
    
    | @@ -1,2 +1,3 @@ | |
| 1 | 
             
            __pycache__/
         | 
| 2 | 
            -
            images/
         | 
|  | 
|  | |
| 1 | 
             
            __pycache__/
         | 
| 2 | 
            +
            images/
         | 
| 3 | 
            +
            .venv/
         | 
    	
        app.py
    CHANGED
    
    | @@ -1,40 +1,48 @@ | |
|  | |
|  | |
| 1 | 
             
            import spacy
         | 
|  | |
| 2 | 
             
            import streamlit as st
         | 
| 3 |  | 
| 4 | 
            -
             | 
| 5 | 
            -
             | 
| 6 |  | 
| 7 | 
            -
             | 
| 8 | 
            -
                colors = {"LOCATION": "#5cff84"}
         | 
| 9 | 
            -
                options = {"ents": ["LOCATION"], "colors": colors}
         | 
| 10 | 
            -
                html = spacy.displacy.render(entities, style="ent", options=options, manual=True)
         | 
| 11 | 
            -
                html = html.replace("\n", " ")
         | 
| 12 |  | 
| 13 | 
            -
             | 
| 14 |  | 
|  | |
|  | |
| 15 |  | 
| 16 | 
             
            HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
         | 
| 17 |  | 
| 18 | 
            -
            st.header("Location Entity Recognition Demo 🔎🌆🌍")
         | 
| 19 | 
            -
            threshold = st.sidebar.slider("Threshold", value=0.5, min_value=0.0, max_value=1.0)
         | 
| 20 | 
            -
            display_probabilities = st.sidebar.checkbox("Display probabilities")
         | 
| 21 |  | 
|  | |
|  | |
|  | |
|  | |
| 22 |  | 
| 23 | 
            -
             | 
|  | |
| 24 |  | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
            doc = nlp(text)
         | 
| 28 |  | 
| 29 | 
            -
             | 
| 30 | 
            -
                {"start": ent.start_char, "end": ent.end_char, "label": "LOCATION"}
         | 
| 31 | 
            -
                for ent in doc.ents
         | 
| 32 | 
            -
            ]
         | 
| 33 | 
            -
            foo = {"text": text, "ents": ents}
         | 
| 34 |  | 
| 35 |  | 
| 36 | 
            -
             | 
| 37 | 
            -
            print(doc.ents)
         | 
| 38 |  | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import random
         | 
| 2 | 
            +
             | 
| 3 | 
             
            import spacy
         | 
| 4 | 
            +
            import srsly
         | 
| 5 | 
             
            import streamlit as st
         | 
| 6 |  | 
| 7 | 
            +
            nlp = spacy.load("en_core_web_trf")
         | 
|  | |
| 8 |  | 
| 9 | 
            +
            # Load pre-processed grants from disk.
         | 
|  | |
|  | |
|  | |
|  | |
| 10 |  | 
| 11 | 
            +
            grants = list(srsly.read_jsonl("data/processed/entities.jsonl"))
         | 
| 12 |  | 
| 13 | 
            +
            colors = {"GPE": "#5cff84", "LOC": "#5cff84"}
         | 
| 14 | 
            +
            options = {"ents": ["GPE", "LOC"], "colors": colors}
         | 
| 15 |  | 
| 16 | 
             
            HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
         | 
| 17 |  | 
|  | |
|  | |
|  | |
| 18 |  | 
| 19 | 
            +
            def render_entities(doc, colors: dict, options: dict) -> str:
         | 
| 20 | 
            +
                """
         | 
| 21 | 
            +
                Takes a SpaCy doc
         | 
| 22 | 
            +
                """
         | 
| 23 |  | 
| 24 | 
            +
                #if isinstance(doc, spacy.tokens.doc.Doc):
         | 
| 25 | 
            +
                #    doc = doc.to_json()
         | 
| 26 |  | 
| 27 | 
            +
                html = spacy.displacy.render(doc, style="ent", options=options)
         | 
| 28 | 
            +
                html = html.replace("\n", " ")
         | 
|  | |
| 29 |  | 
| 30 | 
            +
                return html
         | 
|  | |
|  | |
|  | |
|  | |
| 31 |  | 
| 32 |  | 
| 33 | 
            +
            st.header("Location Entity Recognition Demo 🔎🌆🌍")
         | 
|  | |
| 34 |  | 
| 35 | 
            +
            st.subheader("Look for Locations")
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            if st.button("Show new example", key="text"):
         | 
| 38 | 
            +
                sample = random.choice(grants)
         | 
| 39 | 
            +
                doc = nlp(sample["text"])
         | 
| 40 | 
            +
                html = render_entities(doc, colors, options)
         | 
| 41 | 
            +
                text = st.text_area("Text input", value=sample["text"], height=200)
         | 
| 42 | 
            +
                st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
         | 
| 43 | 
            +
            else:
         | 
| 44 | 
            +
                sample = random.choice(grants)
         | 
| 45 | 
            +
                doc = nlp(sample["text"])
         | 
| 46 | 
            +
                html = render_entities(doc, colors, options)
         | 
| 47 | 
            +
                text = st.text_area("Text input", value=sample["text"], height=200, help="Enter text here and click the 'Find Locations' button to search for entities.")
         | 
| 48 | 
            +
                st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
         | 
    	
        data/processed/.gitignore
    CHANGED
    
    | @@ -1 +1,2 @@ | |
| 1 | 
             
            /wellcome_grant_descriptions.csv
         | 
|  | 
|  | |
| 1 | 
             
            /wellcome_grant_descriptions.csv
         | 
| 2 | 
            +
            /entities.jsonl
         | 
    	
        dvc.lock
    CHANGED
    
    | @@ -7,9 +7,25 @@ stages: | |
| 7 | 
             
                  md5: 5c0d0e532709648b61625e7e130dfaa4
         | 
| 8 | 
             
                  size: 31028261
         | 
| 9 | 
             
                - path: src/subset_data.py
         | 
| 10 | 
            -
                  md5:  | 
| 11 | 
            -
                  size:  | 
|  | |
|  | |
|  | |
| 12 | 
             
                outs:
         | 
| 13 | 
             
                - path: data/processed/wellcome_grant_descriptions.csv
         | 
| 14 | 
            -
                  md5:  | 
| 15 | 
            -
                  size:  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 7 | 
             
                  md5: 5c0d0e532709648b61625e7e130dfaa4
         | 
| 8 | 
             
                  size: 31028261
         | 
| 9 | 
             
                - path: src/subset_data.py
         | 
| 10 | 
            +
                  md5: f4cffd497cb8341cf05728e89cbb0871
         | 
| 11 | 
            +
                  size: 1008
         | 
| 12 | 
            +
                params:
         | 
| 13 | 
            +
                  params.yaml:
         | 
| 14 | 
            +
                    n_docs: 500
         | 
| 15 | 
             
                outs:
         | 
| 16 | 
             
                - path: data/processed/wellcome_grant_descriptions.csv
         | 
| 17 | 
            +
                  md5: 18dd6a7611d7f53b1067def7ba075cba
         | 
| 18 | 
            +
                  size: 644736
         | 
| 19 | 
            +
              entities:
         | 
| 20 | 
            +
                cmd: python src/process_docs.py
         | 
| 21 | 
            +
                deps:
         | 
| 22 | 
            +
                - path: data/processed/wellcome_grant_descriptions.csv
         | 
| 23 | 
            +
                  md5: 18dd6a7611d7f53b1067def7ba075cba
         | 
| 24 | 
            +
                  size: 644736
         | 
| 25 | 
            +
                - path: src/process_docs.py
         | 
| 26 | 
            +
                  md5: 1f570b1aa0f44b0bb131317c305deff5
         | 
| 27 | 
            +
                  size: 1309
         | 
| 28 | 
            +
                outs:
         | 
| 29 | 
            +
                - path: data/processed/entities.jsonl
         | 
| 30 | 
            +
                  md5: 26846cdd657a516281b24c376a93f018
         | 
| 31 | 
            +
                  size: 214902
         | 
    	
        dvc.yaml
    CHANGED
    
    | @@ -4,5 +4,14 @@ stages: | |
| 4 | 
             
                deps:
         | 
| 5 | 
             
                  - src/subset_data.py
         | 
| 6 | 
             
                  - data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
         | 
|  | |
|  | |
| 7 | 
             
                outs:
         | 
| 8 | 
             
                  - data/processed/wellcome_grant_descriptions.csv
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 4 | 
             
                deps:
         | 
| 5 | 
             
                  - src/subset_data.py
         | 
| 6 | 
             
                  - data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
         | 
| 7 | 
            +
                params:
         | 
| 8 | 
            +
                  - n_docs
         | 
| 9 | 
             
                outs:
         | 
| 10 | 
             
                  - data/processed/wellcome_grant_descriptions.csv
         | 
| 11 | 
            +
              entities:
         | 
| 12 | 
            +
                cmd: python src/process_docs.py
         | 
| 13 | 
            +
                deps:
         | 
| 14 | 
            +
                  - src/process_docs.py
         | 
| 15 | 
            +
                  - data/processed/wellcome_grant_descriptions.csv
         | 
| 16 | 
            +
                outs:
         | 
| 17 | 
            +
                  - data/processed/entities.jsonl
         | 
    	
        params.yaml
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            n_docs: 500
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -14,10 +14,8 @@ confection==0.0.3 | |
| 14 | 
             
            cymem==2.0.6
         | 
| 15 | 
             
            decorator==5.1.1
         | 
| 16 | 
             
            entrypoints==0.4
         | 
| 17 | 
            -
            filelock==3.8.0
         | 
| 18 | 
             
            gitdb==4.0.9
         | 
| 19 | 
             
            GitPython==3.1.29
         | 
| 20 | 
            -
            huggingface-hub==0.10.1
         | 
| 21 | 
             
            idna==3.4
         | 
| 22 | 
             
            importlib-metadata==5.0.0
         | 
| 23 | 
             
            importlib-resources==5.10.0
         | 
| @@ -31,7 +29,7 @@ packaging==21.3 | |
| 31 | 
             
            pandas==1.5.0
         | 
| 32 | 
             
            pathy==0.6.2
         | 
| 33 | 
             
            Pillow==9.2.0
         | 
| 34 | 
            -
             | 
| 35 | 
             
            preshed==3.0.7
         | 
| 36 | 
             
            protobuf==3.20.3
         | 
| 37 | 
             
            pyarrow==9.0.0
         | 
| @@ -44,8 +42,6 @@ pyrsistent==0.18.1 | |
| 44 | 
             
            python-dateutil==2.8.2
         | 
| 45 | 
             
            pytz==2022.4
         | 
| 46 | 
             
            pytz-deprecation-shim==0.1.0.post0
         | 
| 47 | 
            -
            PyYAML==6.0
         | 
| 48 | 
            -
            regex==2022.9.13
         | 
| 49 | 
             
            requests==2.28.1
         | 
| 50 | 
             
            rich==12.6.0
         | 
| 51 | 
             
            semver==2.13.0
         | 
| @@ -58,15 +54,12 @@ spacy-loggers==1.0.3 | |
| 58 | 
             
            srsly==2.4.4
         | 
| 59 | 
             
            streamlit==1.13.0
         | 
| 60 | 
             
            thinc==8.1.3
         | 
| 61 | 
            -
            tokenizers==0.13.1
         | 
| 62 | 
             
            toml==0.10.2
         | 
| 63 | 
             
            toolz==0.12.0
         | 
| 64 | 
            -
            torch==1.12.1
         | 
| 65 | 
             
            tornado==6.2
         | 
| 66 | 
             
            tqdm==4.64.1
         | 
| 67 | 
            -
            transformers==4.23.1
         | 
| 68 | 
             
            typer==0.4.2
         | 
| 69 | 
            -
             | 
| 70 | 
             
            tzdata==2022.4
         | 
| 71 | 
             
            tzlocal==4.2
         | 
| 72 | 
             
            urllib3==1.26.12
         | 
|  | |
| 14 | 
             
            cymem==2.0.6
         | 
| 15 | 
             
            decorator==5.1.1
         | 
| 16 | 
             
            entrypoints==0.4
         | 
|  | |
| 17 | 
             
            gitdb==4.0.9
         | 
| 18 | 
             
            GitPython==3.1.29
         | 
|  | |
| 19 | 
             
            idna==3.4
         | 
| 20 | 
             
            importlib-metadata==5.0.0
         | 
| 21 | 
             
            importlib-resources==5.10.0
         | 
|  | |
| 29 | 
             
            pandas==1.5.0
         | 
| 30 | 
             
            pathy==0.6.2
         | 
| 31 | 
             
            Pillow==9.2.0
         | 
| 32 | 
            +
            pkgutil-resolve-name==1.3.10
         | 
| 33 | 
             
            preshed==3.0.7
         | 
| 34 | 
             
            protobuf==3.20.3
         | 
| 35 | 
             
            pyarrow==9.0.0
         | 
|  | |
| 42 | 
             
            python-dateutil==2.8.2
         | 
| 43 | 
             
            pytz==2022.4
         | 
| 44 | 
             
            pytz-deprecation-shim==0.1.0.post0
         | 
|  | |
|  | |
| 45 | 
             
            requests==2.28.1
         | 
| 46 | 
             
            rich==12.6.0
         | 
| 47 | 
             
            semver==2.13.0
         | 
|  | |
| 54 | 
             
            srsly==2.4.4
         | 
| 55 | 
             
            streamlit==1.13.0
         | 
| 56 | 
             
            thinc==8.1.3
         | 
|  | |
| 57 | 
             
            toml==0.10.2
         | 
| 58 | 
             
            toolz==0.12.0
         | 
|  | |
| 59 | 
             
            tornado==6.2
         | 
| 60 | 
             
            tqdm==4.64.1
         | 
|  | |
| 61 | 
             
            typer==0.4.2
         | 
| 62 | 
            +
            typing-extensions==4.4.0
         | 
| 63 | 
             
            tzdata==2022.4
         | 
| 64 | 
             
            tzlocal==4.2
         | 
| 65 | 
             
            urllib3==1.26.12
         | 
    	
        src/process_docs.py
    ADDED
    
    | @@ -0,0 +1,61 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import csv
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import spacy
         | 
| 4 | 
            +
            import srsly
         | 
| 5 | 
            +
            import tqdm
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            nlp = spacy.load("en_core_web_trf")
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
         | 
| 10 | 
            +
            OUTPUT_FILE = "data/processed/entities.jsonl"
         | 
| 11 | 
            +
            ENTITY_SUBSET = ["GPE", "LOC"]
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            def process_documents(input_file: str, output_file: str):
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                data = []
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                print(f"Reading data from {input_file}...")
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                with open(input_file, "r") as f:
         | 
| 21 | 
            +
                    reader = csv.reader(f)
         | 
| 22 | 
            +
                    next(reader)
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                    for row in reader:
         | 
| 25 | 
            +
                        data.append(row[0])
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                print(f"Processing {len(data)} documents...")
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                entities = []
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                for doc_ in tqdm.tqdm(data):
         | 
| 32 | 
            +
                    doc = nlp(doc_)
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                    # Get a list of found entities
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                    ents = [
         | 
| 37 | 
            +
                        {
         | 
| 38 | 
            +
                            "text": ent.text,
         | 
| 39 | 
            +
                            "label": ent.label_,
         | 
| 40 | 
            +
                            "start": ent.start_char,
         | 
| 41 | 
            +
                            "end": ent.end_char,
         | 
| 42 | 
            +
                        }
         | 
| 43 | 
            +
                        for ent in doc.ents
         | 
| 44 | 
            +
                        if ent.label_ in ENTITY_SUBSET
         | 
| 45 | 
            +
                    ]
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                    if ents:
         | 
| 48 | 
            +
                        entities.append(
         | 
| 49 | 
            +
                            {
         | 
| 50 | 
            +
                                "text": doc.text,
         | 
| 51 | 
            +
                                "ents": ents,
         | 
| 52 | 
            +
                            }
         | 
| 53 | 
            +
                        )
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                print(f"Writing {len(entities)} documents to {output_file}...")
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                srsly.write_jsonl(output_file, entities)
         | 
| 58 | 
            +
             | 
| 59 | 
            +
             | 
| 60 | 
            +
            if __name__ == "__main__":
         | 
| 61 | 
            +
                process_documents(INPUT_FILE, OUTPUT_FILE)
         | 
    	
        src/subset_data.py
    CHANGED
    
    | @@ -1,25 +1,41 @@ | |
| 1 | 
             
            import numpy as np
         | 
| 2 | 
             
            import pandas as pd
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 3 |  | 
| 4 | 
             
            INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
         | 
| 5 | 
             
            OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
         | 
| 6 |  | 
| 7 | 
            -
            print(f"Reading data from {INPUT_FILE}")
         | 
| 8 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 9 |  | 
| 10 | 
            -
             | 
|  | |
| 11 |  | 
| 12 | 
            -
             | 
| 13 | 
            -
                data | 
| 14 | 
            -
                .replace("Not available", np.nan)
         | 
| 15 | 
            -
                .dropna()
         | 
| 16 | 
            -
                .drop_duplicates()
         | 
| 17 | 
            -
                .reset_index(drop=True)
         | 
| 18 | 
            -
                .sample(1000)
         | 
| 19 | 
            -
            )
         | 
| 20 |  | 
| 21 | 
            -
            print(f"Number of rows: {data.shape[0]}")
         | 
| 22 | 
            -
            print(f"Number of unique rows: {data['Description'].nunique()}")
         | 
| 23 |  | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
|  | 
|  | |
| 1 | 
             
            import numpy as np
         | 
| 2 | 
             
            import pandas as pd
         | 
| 3 | 
            +
            import yaml
         | 
| 4 | 
            +
             | 
| 5 | 
            +
             | 
| 6 | 
            +
            def load_config(config_file: str) -> dict:
         | 
| 7 | 
            +
                with open(config_file) as f:
         | 
| 8 | 
            +
                    config = yaml.safe_load(f)
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                return config
         | 
| 11 | 
            +
             | 
| 12 |  | 
| 13 | 
             
            INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
         | 
| 14 | 
             
            OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
         | 
| 15 |  | 
|  | |
| 16 |  | 
| 17 | 
            +
            def subset_docs(input_file: str, output_file: str, sample: int):
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                print(f"Reading data from {input_file}")
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                data = pd.read_csv(input_file)
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                data = (
         | 
| 24 | 
            +
                    data[["Description"]]
         | 
| 25 | 
            +
                    .replace("Not available", np.nan)
         | 
| 26 | 
            +
                    .dropna()
         | 
| 27 | 
            +
                    .drop_duplicates()
         | 
| 28 | 
            +
                    .reset_index(drop=True)
         | 
| 29 | 
            +
                    .sample(sample)
         | 
| 30 | 
            +
                )
         | 
| 31 |  | 
| 32 | 
            +
                print(f"Number of rows: {data.shape[0]}")
         | 
| 33 | 
            +
                print(f"Number of unique rows: {data['Description'].nunique()}")
         | 
| 34 |  | 
| 35 | 
            +
                print(f"Saving file to {output_file}")
         | 
| 36 | 
            +
                data.to_csv(output_file, index=False)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 37 |  | 
|  | |
|  | |
| 38 |  | 
| 39 | 
            +
            if __name__ == "__main__":
         | 
| 40 | 
            +
                params = load_config("params.yaml")
         | 
| 41 | 
            +
                subset_docs(INPUT_FILE, OUTPUT_FILE, sample=params["n_docs"])
         | 
    	
        unpinned_requirements.txt
    CHANGED
    
    | @@ -1,2 +1,3 @@ | |
| 1 | 
             
            streamlit
         | 
| 2 | 
            -
            spacy
         | 
|  | 
|  | |
| 1 | 
             
            streamlit
         | 
| 2 | 
            +
            spacy
         | 
| 3 | 
            +
            pandas
         | 
