import requests from urllib.parse import urlencode from Levenshtein import ratio from fuzzywuzzy import fuzz class WFONameMatcher: def __init__(self): self.base_url = "https://list.worldfloraonline.org/matching_rest.php?" self.N_BEST_CANDIDATES = 10 self.NULL_DICT = { "WFO_exact_match": False, "WFO_exact_match_name": "", "WFO_candidate_names": "", "WFO_best_match": "", "WFO_placement": "", "WFO_override_OCR": False, } self.SEP = '|' def extract_input_string(self, record): primary_input = f"{record.get('scientificName', '').strip()} {record.get('scientificNameAuthorship', '').strip()}".strip() secondary_input = ' '.join(filter(None, [record.get('genus', '').strip(), record.get('subgenus', '').strip(), record.get('specificEpithet', '').strip(), record.get('infraspecificEpithet', '').strip()])).strip() return primary_input, secondary_input def query_wfo_name_matching(self, input_string, check_homonyms=True, check_rank=True, accept_single_candidate=True): params = { "input_string": input_string, "check_homonyms": check_homonyms, "check_rank": check_rank, "method": "full", "accept_single_candidate": accept_single_candidate, } full_url = self.base_url + urlencode(params) response = requests.get(full_url) if response.status_code == 200: return response.json() else: return {"error": True, "message": "Failed to fetch data from WFO API"} def query_and_process(self, record): primary_input, secondary_input = self.extract_input_string(record) # Query with primary input primary_result = self.query_wfo_name_matching(primary_input) primary_processed, primary_ranked_candidates = self.process_wfo_response(primary_result, primary_input) if primary_processed.get('WFO_exact_match'): print("Selected Primary --- Exact Primary & Unchecked Secondary") return primary_processed else: # Query with secondary input secondary_result = self.query_wfo_name_matching(secondary_input) secondary_processed, secondary_ranked_candidates = self.process_wfo_response(secondary_result, secondary_input) if secondary_processed.get('WFO_exact_match'): print("Selected Secondary --- Unchecked Primary & Exact Secondary") return secondary_processed else: # Both failed, just return the first failure if (primary_processed.get("WFO_candidate_names") == '') and (secondary_processed.get("WFO_candidate_names") == ''): print("Selected Primary --- Failed Primary & Failed Secondary") return primary_processed # 1st failed, just return the second elif (primary_processed.get("WFO_candidate_names") == '') and (len(secondary_processed.get("WFO_candidate_names")) > 0): print("Selected Secondary --- Failed Primary & Partial Secondary") return secondary_processed # 2nd failed, just return the first elif (len(primary_processed.get("WFO_candidate_names")) > 0) and (secondary_processed.get("WFO_candidate_names") == ''): print("Selected Primary --- Partial Primary & Failed Secondary") return primary_processed # Both have partial matches, compare and rerank elif (len(primary_processed.get("WFO_candidate_names")) > 0) and (len(secondary_processed.get("WFO_candidate_names")) > 0): # Combine and sort results, ensuring no duplicates combined_candidates = list(set(primary_ranked_candidates + secondary_ranked_candidates)) combined_candidates.sort(key=lambda x: (x[1], x[0]), reverse=True) # Sort by similarity score, then name # Replace candidates with combined_candidates and combined best match best_score_primary = primary_processed["WFO_candidate_names"][0][1] best_score_secondary = secondary_processed["WFO_candidate_names"][0][1] # Extracting only the candidate names from the top candidates top_candidates = combined_candidates[:self.N_BEST_CANDIDATES] cleaned_candidates = [cand[0] for cand in top_candidates] if best_score_primary >= best_score_secondary: primary_processed["WFO_candidate_names"] = cleaned_candidates primary_processed["WFO_best_match"] = cleaned_candidates[0] response_placement = self.query_wfo_name_matching(primary_processed["WFO_best_match"]) placement_exact_match = response_placement.get("match") primary_processed["WFO_placement"] = placement_exact_match.get("placement", '') print("Selected Primary --- Partial Primary & Partial Secondary") return primary_processed else: secondary_processed["WFO_candidate_names"] = cleaned_candidates secondary_processed["WFO_best_match"] = cleaned_candidates[0] response_placement = self.query_wfo_name_matching(secondary_processed["WFO_best_match"]) placement_exact_match = response_placement.get("match") secondary_processed["WFO_placement"] = placement_exact_match.get("placement", '') print("Selected Secondary --- Partial Primary & Partial Secondary") return secondary_processed else: return self.NULL_DICT def process_wfo_response(self, response, query): simplified_response = {} ranked_candidates = None exact_match = response.get("match") simplified_response["WFO_exact_match"] = bool(exact_match) candidates = response.get("candidates", []) candidate_names = [candidate["full_name_plain"] for candidate in candidates] if candidates else [] if not exact_match and candidate_names: cleaned_candidates, ranked_candidates = self._rank_candidates_by_similarity(query, candidate_names) simplified_response["WFO_candidate_names"] = cleaned_candidates simplified_response["WFO_best_match"] = cleaned_candidates[0] if cleaned_candidates else '' elif exact_match: simplified_response["WFO_candidate_names"] = exact_match.get("full_name_plain") simplified_response["WFO_best_match"] = exact_match.get("full_name_plain") else: simplified_response["WFO_candidate_names"] = '' simplified_response["WFO_best_match"] = '' # Call WFO again to update placement using WFO_best_match try: response_placement = self.query_wfo_name_matching(simplified_response["WFO_best_match"]) placement_exact_match = response_placement.get("match") simplified_response["WFO_placement"] = placement_exact_match.get("placement", '') except: simplified_response["WFO_placement"] = '' return simplified_response, ranked_candidates def _rank_candidates_by_similarity(self, query, candidates): string_similarities = [] fuzzy_similarities = {candidate: fuzz.ratio(query, candidate) for candidate in candidates} query_words = query.split() for candidate in candidates: candidate_words = candidate.split() # Calculate word similarities and sum them up word_similarities = [ratio(query_word, candidate_word) for query_word, candidate_word in zip(query_words, candidate_words)] total_word_similarity = sum(word_similarities) # Calculate combined similarity score (average of word and fuzzy similarities) fuzzy_similarity = fuzzy_similarities[candidate] combined_similarity = (total_word_similarity + fuzzy_similarity) / 2 string_similarities.append((candidate, combined_similarity)) # Sort the candidates based on combined similarity, higher scores first ranked_candidates = sorted(string_similarities, key=lambda x: x[1], reverse=True) # Extracting only the candidate names from the top candidates top_candidates = ranked_candidates[:self.N_BEST_CANDIDATES] cleaned_candidates = [cand[0] for cand in top_candidates] return cleaned_candidates, ranked_candidates def check_WFO(self, record, replace_if_success_wfo): self.replace_if_success_wfo = replace_if_success_wfo # "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement" simplified_response = self.query_and_process(record) simplified_response['WFO_override_OCR'] = False # best_match if simplified_response.get('WFO_exact_match'): simplified_response['WFO_exact_match_name'] = simplified_response.get('WFO_best_match') else: simplified_response['WFO_exact_match_name'] = '' # placement wfo_placement = simplified_response.get('WFO_placement', '') if wfo_placement: parts = wfo_placement.split('/')[1:] simplified_response['WFO_placement'] = self.SEP.join(parts) else: simplified_response['WFO_placement'] = '' if simplified_response.get('WFO_exact_match') and replace_if_success_wfo: simplified_response['WFO_override_OCR'] = True name_parts = simplified_response.get('WFO_placement').split('$')[0] name_parts = name_parts.split(self.SEP) record['order'] = name_parts[3] record['family'] = name_parts[4] record['genus'] = name_parts[5] record['specificEpithet'] = name_parts[6] record['scientificName'] = simplified_response.get('WFO_exact_match_name') return record, simplified_response def validate_taxonomy_WFO(record_dict, replace_if_success_wfo=False): Matcher = WFONameMatcher() try: record_dict, WFO_dict = Matcher.check_WFO(record_dict, replace_if_success_wfo) return record_dict, WFO_dict except: return record_dict, Matcher.NULL_DICT ''' if __name__ == "__main__": Matcher = WFONameMatcher() # input_string = "Rhopalocarpus alterfolius" record_exact_match ={ "order": "Malpighiales", "family": "Hypericaceae", "scientificName": "Hypericum prolificum", "scientificNameAuthorship": "", "genus": "Hypericum", "subgenus": "", "specificEpithet": "prolificum", "infraspecificEpithet": "", } record_partialPrimary_exactSecondary ={ "order": "Malpighiales", "family": "Hypericaceae", "scientificName": "Hyperic prolificum", "scientificNameAuthorship": "", "genus": "Hypericum", "subgenus": "", "specificEpithet": "prolificum", "infraspecificEpithet": "", } record_exactPrimary_partialSecondary ={ "order": "Malpighiales", "family": "Hypericaceae", "scientificName": "Hypericum prolificum", "scientificNameAuthorship": "", "genus": "Hyperic", "subgenus": "", "specificEpithet": "prolificum", "infraspecificEpithet": "", } record_partialPrimary_partialSecondary ={ "order": "Malpighiales", "family": "Hypericaceae", "scientificName": "Hyperic prolificum", "scientificNameAuthorship": "", "genus": "Hypericum", "subgenus": "", "specificEpithet": "prolific", "infraspecificEpithet": "", } record_partialPrimary_partialSecondary_swap ={ "order": "Malpighiales", "family": "Hypericaceae", "scientificName": "Hypericum prolific", "scientificNameAuthorship": "", "genus": "Hyperic", "subgenus": "", "specificEpithet": "prolificum", "infraspecificEpithet": "", } record_errorPrimary_partialSecondary ={ "order": "Malpighiales", "family": "Hypericaceae", "scientificName": "ricum proli", "scientificNameAuthorship": "", "genus": "Hyperic", "subgenus": "", "specificEpithet": "prolificum", "infraspecificEpithet": "", } record_partialPrimary_errorSecondary ={ "order": "Malpighiales", "family": "Hypericaceae", "scientificName": "Hyperic prolificum", "scientificNameAuthorship": "", "genus": "ricum", "subgenus": "", "specificEpithet": "proli", "infraspecificEpithet": "", } record_errorPrimary_errorSecondary ={ "order": "Malpighiales", "family": "Hypericaceae", "scientificName": "ricum proli", "scientificNameAuthorship": "", "genus": "ricum", "subgenus": "", "specificEpithet": "proli", "infraspecificEpithet": "", } options = [record_exact_match, record_partialPrimary_exactSecondary, record_exactPrimary_partialSecondary, record_partialPrimary_partialSecondary, record_partialPrimary_partialSecondary_swap, record_errorPrimary_partialSecondary, record_partialPrimary_errorSecondary, record_errorPrimary_errorSecondary] for opt in options: simplified_response = Matcher.check_WFO(opt) print(json.dumps(simplified_response, indent=4)) '''