Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -1 +1,110 @@ | |
| 1 | 
            -
            import  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import pandas as pd
         | 
| 2 | 
            +
            from tfidf_matcher.ngrams import ngrams
         | 
| 3 | 
            +
            from sklearn.feature_extraction.text import TfidfVectorizer
         | 
| 4 | 
            +
            from sklearn.neighbors import NearestNeighbors
         | 
| 5 | 
            +
            import gradio as gr
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            def matcher(original=None, lookup=None, outname='Original', ngram_length=3, cutoff=0.8):
                """Fuzzy-match each name in ``original`` to its closest name in ``lookup``.

                Both inputs are converted to character n-gram TF-IDF vectors (the
                vectorizer is fitted on ``lookup``) and a one-nearest-neighbour search
                with cosine distance selects the best lookup candidate for every
                original name. Names are lower-cased for matching only; the returned
                frame keeps the spelling that was passed in.

                Args:
                    original: Names to match, either a comma-separated string (what a
                        text box delivers) or an iterable of strings. ``None`` means
                        "no names".
                    lookup: Candidate names, in the same accepted forms as ``original``.
                    outname: Header of the output column holding the original names.
                    ngram_length: Character n-gram size handed to ``ngrams``.
                    cutoff: Only rows whose confidence is strictly greater than this
                        value are returned.

                Returns:
                    A ``pandas.DataFrame`` with the columns ``[outname, "Match",
                    "Match Confidence"]``, sorted by confidence in descending order.
                    The frame is empty when either input contains no names.
                """
                # The output has a single "Match" column, so exactly one neighbour is requested.
                k_matches = 1
                columns = [outname, "Match", "Match Confidence"]

                def _parse_names(value):
                    # Accept the raw comma-separated string as well as a ready-made
                    # iterable; trim surrounding blanks and drop empty items so that
                    # "a, b," behaves like ["a", "b"].
                    if value is None:
                        return []
                    parts = value.split(",") if isinstance(value, str) else value
                    stripped = (part.strip() for part in parts)
                    return [name for name in stripped if name]

                original = _parse_names(original)
                lookup = _parse_names(lookup)

                # Nothing to match (or nothing to match against): return an empty,
                # correctly-shaped frame instead of failing inside the vectorizer.
                if not original or not lookup:
                    return pd.DataFrame(columns=columns)

                # Set ngram length for TfidfVectorizer callable
                def ngrams_user(string, n=ngram_length):
                    return ngrams(string, n)

                # Generate sparse TF-IDF matrix from the lookup corpus.
                # NOTE(review): if no lookup name yields any n-gram (e.g. every name is
                # shorter than ngram_length) the vectorizer is expected to raise
                # ValueError - confirm whether callers want that surfaced.
                vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams_user)
                tf_idf_lookup = vectorizer.fit_transform([name.lower() for name in lookup])

                # Fit KNN model to the sparse TF-IDF matrix generated from lookup
                nbrs = NearestNeighbors(n_neighbors=k_matches, n_jobs=-1, metric="cosine").fit(tf_idf_lookup)

                # Vectorize the original names with the same vocabulary, then query.
                tf_idf_original = vectorizer.transform([name.lower() for name in original])
                distances, lookup_indices = nbrs.kneighbors(tf_idf_original)

                # One row per original name: (name, nearest lookup name, confidence).
                # Confidence is 1 - cosine distance; rounding happens AFTER the
                # subtraction so values stay clean two-decimal numbers.
                rows = [
                    (name, lookup[int(indices[0])], round(1 - float(dists[0]), 2))
                    for name, indices, dists in zip(original, lookup_indices, distances)
                ]
                matches = pd.DataFrame(rows, columns=columns)

                # Keep confident matches only; sort_values returns a new frame, so the
                # filtered slice is never modified in place.
                above_cutoff = matches.loc[matches["Match Confidence"] > cutoff]
                return above_cutoff.sort_values(by=["Match Confidence"], ascending=False)
         | 
| 82 | 
            +
             | 
| 83 | 
            +
            def combine(a, b):
                """Return ``a`` followed by a single space and then ``b``."""
                with_separator = a + " "
                return with_separator + b
         | 
| 85 | 
            +
             | 
| 86 | 
            +
             | 
| 87 | 
            +
            # Gradio UI: two text boxes with comma-separated names on the left, the
            # table of matches on the right, and a button that runs matcher().
            with gr.Blocks() as demo:

                with gr.Row():
                    # Left column: the names to match and the candidates to match against.
                    with gr.Column():
                        txt = gr.Textbox(label="Input a list of names", value='Courtney Walsh,Curtly Ambrose,Malcolm Marshall,Brian Lara,Viv Richards,Obama', lines=2)
                        txt_2 = gr.Textbox(label="Input some names to match", value="Walsh, Ambrose, Marshall, Lara", lines=2)

                    # Right column: result table. The headers mirror the column names
                    # of the DataFrame that matcher() returns with its default outname.
                    with gr.Column():
                        outty = gr.Dataframe(
                            headers=["Original", "Match", "Match Confidence"],
                            datatype=["str", "str", "number"],
                            label="Matched",
                        )

                # txt -> matcher(original=...), txt_2 -> matcher(lookup=...); the
                # remaining matcher() parameters keep their defaults.
                btn = gr.Button(value="Submit")
                btn.click(matcher, inputs=[txt, txt_2], outputs=[outty])


            if __name__ == "__main__":
                demo.launch()
         |