# Spaces: phyloforfun / VoucherVision (Running)
# File size: 14,328 Bytes
# e91ac58

import requests
from urllib.parse import urlencode
from Levenshtein import ratio
from fuzzywuzzy import fuzz

class WFONameMatcher:
    """Match a record's taxon name against the World Flora Online (WFO) matching endpoint.

    A record is queried at most twice: first with ``scientificName`` plus
    ``scientificNameAuthorship`` (the "primary" input), then with the
    space-joined ``genus``/``subgenus``/``specificEpithet``/
    ``infraspecificEpithet`` fields (the "secondary" input). An exact match
    wins immediately; otherwise the returned candidates are ranked by string
    similarity to the query.
    """

    def __init__(self):
        self.base_url = "https://list.worldfloraonline.org/matching_rest.php?"
        # Maximum number of candidate names kept after ranking.
        self.N_BEST_CANDIDATES = 10
        # Template of the "nothing found" result; callers should copy it
        # rather than mutate it.
        self.NULL_DICT = {
            "WFO_exact_match": False,
            "WFO_exact_match_name": "",
            "WFO_candidate_names": "",
            "WFO_best_match": "",
            "WFO_placement": "",
            "WFO_override_OCR": False,
        }
        # Separator used when re-joining the '/'-delimited placement path.
        self.SEP = '|'

    @staticmethod
    def _field(record, key):
        """Return ``record[key]`` as a stripped string; missing/None values become ''."""
        value = record.get(key)
        return '' if value is None else str(value).strip()

    def extract_input_string(self, record):
        """Build the two query strings for *record*.

        Returns:
            ``(primary_input, secondary_input)`` where the primary input is
            "scientificName scientificNameAuthorship" and the secondary input
            is the non-empty genus/subgenus/specificEpithet/
            infraspecificEpithet values joined by single spaces.
        """
        primary_input = (
            f"{self._field(record, 'scientificName')} "
            f"{self._field(record, 'scientificNameAuthorship')}"
        ).strip()
        secondary_parts = [
            self._field(record, key)
            for key in ('genus', 'subgenus', 'specificEpithet', 'infraspecificEpithet')
        ]
        secondary_input = ' '.join(part for part in secondary_parts if part)

        return primary_input, secondary_input

    def query_wfo_name_matching(self, input_string, check_homonyms=True, check_rank=True,
                                accept_single_candidate=True, timeout=30):
        """Query the WFO matching endpoint for *input_string*.

        Args:
            input_string: Name to match.
            check_homonyms, check_rank, accept_single_candidate: Forwarded
                unchanged as query parameters.
            timeout: Seconds to wait for the HTTP request before giving up.

        Returns:
            The decoded JSON object on success, otherwise a dict with
            ``"error": True`` and a ``"message"``. Network failures, non-200
            responses and undecodable bodies all take the error path instead
            of raising.
        """
        # An empty name cannot match anything; skip the network round trip.
        if not input_string or not str(input_string).strip():
            return {"error": True, "message": "Empty input string"}

        params = {
            "input_string": input_string,
            "check_homonyms": check_homonyms,
            "check_rank": check_rank,
            "method": "full",
            "accept_single_candidate": accept_single_candidate,
        }
        full_url = self.base_url + urlencode(params)
        failure = {"error": True, "message": "Failed to fetch data from WFO API"}

        try:
            response = requests.get(full_url, timeout=timeout)
            if response.status_code != 200:
                return failure
            payload = response.json()
        except (requests.RequestException, ValueError):
            # ValueError covers JSON decoding errors raised by response.json().
            return failure

        # Downstream code calls .get() on the payload, so insist on a dict.
        return payload if isinstance(payload, dict) else failure

    def _lookup_placement(self, name):
        """Return the placement string WFO reports for an exact match of *name*, or ''."""
        if not name:
            return ''
        match = self.query_wfo_name_matching(name).get("match")
        if not isinstance(match, dict):
            return ''
        return match.get("placement") or ''

    @staticmethod
    def _merge_ranked_candidates(*ranked_lists):
        """Merge ``(name, score)`` lists, keeping each name once with its best score.

        The result is sorted by score, then name, both descending.
        """
        best_scores = {}
        for ranked in ranked_lists:
            for name, score in ranked or []:
                if name not in best_scores or score > best_scores[name]:
                    best_scores[name] = score
        return sorted(best_scores.items(), key=lambda item: (item[1], item[0]), reverse=True)

    def query_and_process(self, record):
        """Resolve *record* against WFO, trying the primary then the secondary input.

        Returns:
            The simplified response dict (see ``process_wfo_response``) of
            whichever query was selected. When both queries only produce
            candidates, the two candidate lists are merged and re-ranked and
            the placement is looked up again for the new best match.
        """
        primary_input, secondary_input = self.extract_input_string(record)

        # Query with primary input
        primary_processed, primary_ranked = self.process_wfo_response(
            self.query_wfo_name_matching(primary_input), primary_input)
        if primary_processed.get('WFO_exact_match'):
            print("Selected Primary --- Exact Primary & Unchecked Secondary")
            return primary_processed

        # Query with secondary input
        secondary_processed, secondary_ranked = self.process_wfo_response(
            self.query_wfo_name_matching(secondary_input), secondary_input)
        if secondary_processed.get('WFO_exact_match'):
            print("Selected Secondary --- Unchecked Primary & Exact Secondary")
            return secondary_processed

        # Without an exact match, a query has candidates exactly when its
        # ranked list is non-empty (it is None when nothing was returned).
        if not primary_ranked and not secondary_ranked:
            print("Selected Primary --- Failed Primary & Failed Secondary")
            return primary_processed
        if not primary_ranked:
            print("Selected Secondary --- Failed Primary & Partial Secondary")
            return secondary_processed
        if not secondary_ranked:
            print("Selected Primary --- Partial Primary & Failed Secondary")
            return primary_processed

        # Both have partial matches: combine, de-duplicate by name and rerank.
        combined = self._merge_ranked_candidates(primary_ranked, secondary_ranked)
        cleaned_candidates = [name for name, _ in combined[:self.N_BEST_CANDIDATES]]

        # Ranked lists are sorted best-first, so element 0 carries the top score.
        if primary_ranked[0][1] >= secondary_ranked[0][1]:
            selected = primary_processed
            message = "Selected Primary --- Partial Primary & Partial Secondary"
        else:
            selected = secondary_processed
            message = "Selected Secondary --- Partial Primary & Partial Secondary"

        selected["WFO_candidate_names"] = cleaned_candidates
        selected["WFO_best_match"] = cleaned_candidates[0]
        selected["WFO_placement"] = self._lookup_placement(selected["WFO_best_match"])

        print(message)
        return selected

    def process_wfo_response(self, response, query):
        """Reduce a raw WFO response to the fields this module reports.

        Args:
            response: Dict returned by ``query_wfo_name_matching``.
            query: The string that was sent, used to rank candidates.

        Returns:
            ``(simplified_response, ranked_candidates)``. The dict holds
            ``WFO_exact_match``, ``WFO_candidate_names``, ``WFO_best_match``
            and ``WFO_placement``. ``WFO_candidate_names`` is a list of names
            for a partial match, the matched name string for an exact match,
            and '' when nothing was returned. ``ranked_candidates`` is the
            full ``(name, score)`` list for a partial match, otherwise None.
        """
        simplified_response = {}
        ranked_candidates = None

        exact_match = response.get("match")
        simplified_response["WFO_exact_match"] = bool(exact_match)

        candidates = response.get("candidates") or []
        # Drop candidates without a usable name; they cannot be ranked.
        candidate_names = [
            candidate.get("full_name_plain")
            for candidate in candidates
            if candidate.get("full_name_plain")
        ]

        if exact_match:
            matched_name = exact_match.get("full_name_plain")
            simplified_response["WFO_candidate_names"] = matched_name
            simplified_response["WFO_best_match"] = matched_name
        elif candidate_names:
            cleaned_candidates, ranked_candidates = self._rank_candidates_by_similarity(
                query, candidate_names)
            simplified_response["WFO_candidate_names"] = cleaned_candidates
            simplified_response["WFO_best_match"] = cleaned_candidates[0] if cleaned_candidates else ''
        else:
            simplified_response["WFO_candidate_names"] = ''
            simplified_response["WFO_best_match"] = ''

        # Call WFO again to update placement using WFO_best_match; placement
        # is best-effort, so any failure degrades to an empty string.
        try:
            simplified_response["WFO_placement"] = self._lookup_placement(
                simplified_response["WFO_best_match"])
        except Exception:
            simplified_response["WFO_placement"] = ''

        return simplified_response, ranked_candidates

    def _rank_candidates_by_similarity(self, query, candidates):
        """Rank *candidates* by similarity to *query*.

        Each candidate's score is the mean of (a) the summed per-word
        ``ratio`` of query and candidate words paired by position and (b) the
        whole-string ``fuzz.ratio``.

        Returns:
            ``(cleaned_candidates, ranked_candidates)``: the top
            ``N_BEST_CANDIDATES`` names, and every ``(name, score)`` pair
            sorted by score descending.
        """
        # NOTE(review): Levenshtein.ratio presumably yields values in [0, 1]
        # while fuzz.ratio yields 0-100, so the fuzzy term likely dominates
        # the combined score - confirm whether this weighting is intended.
        query_words = query.split()
        string_similarities = []

        for candidate in candidates:
            # zip() stops at the shorter word list; extra words are ignored.
            total_word_similarity = sum(
                ratio(query_word, candidate_word)
                for query_word, candidate_word in zip(query_words, candidate.split())
            )
            fuzzy_similarity = fuzz.ratio(query, candidate)
            combined_similarity = (total_word_similarity + fuzzy_similarity) / 2
            string_similarities.append((candidate, combined_similarity))

        # Sort the candidates based on combined similarity, higher scores first
        ranked_candidates = sorted(string_similarities, key=lambda x: x[1], reverse=True)

        # Extracting only the candidate names from the top candidates
        cleaned_candidates = [name for name, _ in ranked_candidates[:self.N_BEST_CANDIDATES]]

        return cleaned_candidates, ranked_candidates

    def check_WFO(self, record, replace_if_success_wfo):
        """Validate *record* against WFO and optionally overwrite its taxonomy fields.

        Args:
            record: Dict of taxon fields; mutated in place when an override happens.
            replace_if_success_wfo: When true and WFO returns an exact match,
                copy order/family/genus/specificEpithet (placement positions
                3-6) and the matched name into *record*.

        Returns:
            ``(record, simplified_response)``. The response dict carries the
            keys ``WFO_exact_match``, ``WFO_exact_match_name``,
            ``WFO_best_match``, ``WFO_candidate_names``, ``WFO_placement``
            and ``WFO_override_OCR``.
        """
        self.replace_if_success_wfo = replace_if_success_wfo

        simplified_response = self.query_and_process(record)
        simplified_response['WFO_override_OCR'] = False

        # best_match
        is_exact = bool(simplified_response.get('WFO_exact_match'))
        simplified_response['WFO_exact_match_name'] = (
            simplified_response.get('WFO_best_match') if is_exact else ''
        )

        # placement: drop the first '/'-separated segment and re-join with SEP
        wfo_placement = simplified_response.get('WFO_placement', '')
        if wfo_placement:
            simplified_response['WFO_placement'] = self.SEP.join(wfo_placement.split('/')[1:])
        else:
            simplified_response['WFO_placement'] = ''

        if is_exact and replace_if_success_wfo:
            name_parts = simplified_response['WFO_placement'].split('$')[0].split(self.SEP)
            # Only override when every needed rank is present, so the record
            # is never left half-updated by a short placement path.
            if len(name_parts) > 6:
                simplified_response['WFO_override_OCR'] = True
                record['order'] = name_parts[3]
                record['family'] = name_parts[4]
                record['genus'] = name_parts[5]
                record['specificEpithet'] = name_parts[6]
                record['scientificName'] = simplified_response.get('WFO_exact_match_name')

        return record, simplified_response
    
def validate_taxonomy_WFO(record_dict, replace_if_success_wfo=False):
    """Check *record_dict* against WFO, falling back to an empty result on any error.

    Args:
        record_dict: Record of taxon fields passed to ``WFONameMatcher.check_WFO``.
        replace_if_success_wfo: Forwarded to ``check_WFO``.

    Returns:
        ``(record_dict, WFO_dict)``. If ``check_WFO`` raises, the record is
        returned together with a copy of the matcher's ``NULL_DICT``.
    """
    Matcher = WFONameMatcher()
    try:
        record_dict, WFO_dict = Matcher.check_WFO(record_dict, replace_if_success_wfo)
        return record_dict, WFO_dict
    except Exception:
        # Best-effort validation: never let a WFO failure break the caller,
        # but do not swallow KeyboardInterrupt/SystemExit as a bare except would.
        return record_dict, dict(Matcher.NULL_DICT)

'''
if __name__ == "__main__":
    Matcher = WFONameMatcher()
    # input_string = "Rhopalocarpus alterfolius"
    record_exact_match ={
        "order": "Malpighiales",
        "family": "Hypericaceae",
        "scientificName": "Hypericum prolificum",
        "scientificNameAuthorship": "",

        "genus": "Hypericum",
        "subgenus": "",
        "specificEpithet": "prolificum",
        "infraspecificEpithet": "",
    }
    record_partialPrimary_exactSecondary ={
        "order": "Malpighiales",
        "family": "Hypericaceae",
        "scientificName": "Hyperic prolificum",
        "scientificNameAuthorship": "",

        "genus": "Hypericum",
        "subgenus": "",
        "specificEpithet": "prolificum",
        "infraspecificEpithet": "",
    }
    record_exactPrimary_partialSecondary ={
        "order": "Malpighiales",
        "family": "Hypericaceae",
        "scientificName": "Hypericum prolificum",
        "scientificNameAuthorship": "",

        "genus": "Hyperic",
        "subgenus": "",
        "specificEpithet": "prolificum",
        "infraspecificEpithet": "",
    }
    record_partialPrimary_partialSecondary ={
        "order": "Malpighiales",
        "family": "Hypericaceae",
        "scientificName": "Hyperic prolificum",
        "scientificNameAuthorship": "",

        "genus": "Hypericum",
        "subgenus": "",
        "specificEpithet": "prolific",
        "infraspecificEpithet": "",
    }
    record_partialPrimary_partialSecondary_swap ={
        "order": "Malpighiales",
        "family": "Hypericaceae",
        "scientificName": "Hypericum prolific",
        "scientificNameAuthorship": "",

        "genus": "Hyperic",
        "subgenus": "",
        "specificEpithet": "prolificum",
        "infraspecificEpithet": "",
    }
    record_errorPrimary_partialSecondary ={
        "order": "Malpighiales",
        "family": "Hypericaceae",
        "scientificName": "ricum proli",
        "scientificNameAuthorship": "",

        "genus": "Hyperic",
        "subgenus": "",
        "specificEpithet": "prolificum",
        "infraspecificEpithet": "",
    }
    record_partialPrimary_errorSecondary ={
        "order": "Malpighiales",
        "family": "Hypericaceae",
        "scientificName": "Hyperic prolificum",
        "scientificNameAuthorship": "",

        "genus": "ricum",
        "subgenus": "",
        "specificEpithet": "proli",
        "infraspecificEpithet": "",
    }
    record_errorPrimary_errorSecondary ={
        "order": "Malpighiales",
        "family": "Hypericaceae",
        "scientificName": "ricum proli",
        "scientificNameAuthorship": "",

        "genus": "ricum",
        "subgenus": "",
        "specificEpithet": "proli",
        "infraspecificEpithet": "",
    }
    options = [record_exact_match,
               record_partialPrimary_exactSecondary,
               record_exactPrimary_partialSecondary,
               record_partialPrimary_partialSecondary,
               record_partialPrimary_partialSecondary_swap,
               record_errorPrimary_partialSecondary,
               record_partialPrimary_errorSecondary,
               record_errorPrimary_errorSecondary]
    import json  # not imported at module level
    for opt in options:
        _, simplified_response = Matcher.check_WFO(opt, replace_if_success_wfo=False)
        print(json.dumps(simplified_response, indent=4))
'''