from collections import OrderedDict
from datetime import datetime
from typing import Optional, Tuple


class CacheHandler:
    """Small in-memory LRU cache of web pages and their summaries, keyed by URL.

    Each entry is a dict with the keys ``title``, ``text``, ``date`` and one
    ``summary_<summary_type>`` key per summary type (``summary_0``,
    ``summary_50`` and ``summary_100`` are always present, possibly ``None``).

    The cache is pre-populated with one demo entry (``DEFAULT_URL``) that is
    never evicted. Hit and miss counters are kept for ``get_cache_stats``.
    """

    # URL of the demo's default example; its entry is pinned in the cache.
    DEFAULT_URL = "https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/"

    # Page text of the default entry. Kept as explicit pieces so that the
    # embedded newline and trailing spaces are visible in the source.
    _DEFAULT_TEXT = (
        "I am currently a PhD candidate specializing in Natural Language Processing (NLP) at the "
        "University of the Basque Country UPV/EHU, IXA Group, and HiTZ Basque Center for Language "
        "Technologies, funded by a grant from the Basque Government. "
        "My advisors are German Rigau and Rodrigo Agerri. "
        "I anticipate concluding my PhD by early 2024. "
        "My previous experiences include an internship as an Applied Scientist at Amazon "
        "Barcelona, where I was part of Lluis Marquez's team. "
        "I also served as Visiting Associate for 4 months at the School of Engineering and "
        "Applied Science, Department of Computer and Information Science, Cognitive Computation "
        "Group at the University of Pennsylvania under the supervision of Dan Roth. "
        "My research primarily focuses on Multilingual Natural Language Processing. "
        "I aim to develop deep learning models and resources that enable NLP in languages with "
        "limited or no available resources. "
        "This research branches in two directions. "
        "First, data-transfer methods for which I have developed state-of-the-art techniques to "
        "automatically generate annotated data for languages that lack these resources. "
        "Second, model-transfer methods, a field in which I've made significant contributions to "
        "improve the zero-shot cross-lingual performance of NLP models. "
        "Recently, my research has branched into training Large Language Models (LLMs) for "
        "various tasks and domains. \n"
        "The most notable ones being GoLLIE a 34B parameter LLM which achieves state-of-the-art "
        "results for zero-shot Information Extraction, and MedMT5, the first open-source "
        "text-to-text multilingual model for the medical domain. "
    )

    def __init__(self, max_cache_size: int = 1000):
        """Create the cache and seed it with the pinned default entry.

        Args:
            max_cache_size: Maximum number of entries kept; once exceeded, the
                least recently used entry other than the default one is dropped.
        """
        # OrderedDict keeps entries in recency order (oldest first), which
        # makes evicting the least recently used entry a cheap popitem().
        self.cache = OrderedDict()
        self.cache[self.DEFAULT_URL] = self._default_entry()
        self.max_cache_size = max_cache_size
        self.misses = 0
        self.hits = 0

    @classmethod
    def _default_entry(cls) -> dict:
        """Build the pre-computed cache entry for ``DEFAULT_URL``."""
        return {
            "title": "Iker García-Ferrero | Personal Webpage",
            "date": datetime.now(),
            "text": cls._DEFAULT_TEXT,
            # NOTE(review): there is no space between "(EEUU)." and "Sus" in
            # the text below; left untouched because it is user-facing data.
            "summary_0": "Iker García-Ferrero es un candidato a PhD en Natural Language Processing (NLP) "
            "en la Universidad del País Vasco UPV/EHU, IXA Group y HiTZ Centro Vasco de Tecnología de la "
            "Lengua, financiado por una beca del Gobierno Vasco. "
            "En el pasado, ha realizado prácticas en Amazon y ha realizado una estancia "
            "de investigación en la Universidad de Pensilvania (EEUU)."
            "Sus investigaciones se centran en la creación de modelos y recursos para NLP en "
            "lenguas con pocos o ningún recurso disponible, utilizando técnicas de transferencia de "
            "datos y modelos. Recientemente también se ha especializado en el entrenamiento de LLMs",
            "summary_50": "Iker García-Ferrero es un candidato a PhD en NLP en la Universidad del País Vasco, "
            "con experiencia en Amazon, la Universidad de Pensilvania e HiTZ.",
            "summary_100": "Iker García-Ferrero es un candidato a PhD en NLP.",
        }

    def _evict_if_needed(self) -> None:
        """Drop the least recently used entry when the cache is over capacity."""
        if len(self.cache) <= self.max_cache_size:
            return
        # The default entry backs the demo's example, so it is moved to the
        # most-recent end before evicting. The membership check avoids a
        # KeyError if that entry was removed from the cache by other code.
        if self.DEFAULT_URL in self.cache:
            self.cache.move_to_end(self.DEFAULT_URL)
        self.cache.popitem(last=False)  # pop the oldest item

    def add_to_cache(
        self, url: str, title: str, text: str, summary_type: int, summary: str
    ):
        """Store ``summary`` for ``url`` under the given summary type.

        If ``url`` is already cached, only the summary and the date are
        updated (title and text are left as they were) and the entry becomes
        the most recently used one. Otherwise a new entry is created and the
        oldest entry is evicted if the cache grew past ``max_cache_size``.

        Args:
            url: Cache key.
            title: Page title, used only when a new entry is created.
            text: Page text, used only when a new entry is created.
            summary_type: Summary type; stored under ``summary_<summary_type>``.
            summary: The summary text to store.
        """
        summary_key = f"summary_{summary_type}"

        if url in self.cache:
            # Existing entry: refresh it and mark it as most recently used.
            self.cache.move_to_end(url)
            entry = self.cache[url]
            entry[summary_key] = summary
            entry["date"] = datetime.now()
            return

        entry = {
            "title": title,
            "text": text,
            "date": datetime.now(),
            "summary_0": None,
            "summary_50": None,
            "summary_100": None,
        }
        # Same key scheme as the update path, so a summary of a non-standard
        # type is stored instead of being silently dropped.
        entry[summary_key] = summary
        self.cache[url] = entry
        self._evict_if_needed()

    def get_from_cache(
        self, url: str, summary_type: int, second_try: bool = False
    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Look up the cached summary of ``url`` for ``summary_type``.

        Args:
            url: Cache key.
            summary_type: Summary type; looked up as ``summary_<summary_type>``.
            second_try: True when this is a retry of a lookup that already
                missed (e.g. after resolving a shortened URL). A miss on a
                second try is not counted again, and a hit on a second try
                takes back the miss counted by the first try.

        Returns:
            ``(title, text, summary)`` on a hit, ``(None, None, None)`` on a
            miss. A cached URL with no summary of the requested type is a miss.
        """
        entry = self.cache.get(url)
        # .get() so that a summary type the entry has never seen is a miss
        # rather than a KeyError.
        summary = None if entry is None else entry.get(f"summary_{summary_type}")

        if summary is None:
            if not second_try:
                self.misses += 1
            return None, None, None

        # Move the accessed item to the end to mark it as recently used.
        self.cache.move_to_end(url)
        self.hits += 1
        if second_try:
            # The first try was counted as a miss; undo it since the lookup
            # succeeded in the end. Clamped so the counter never goes negative
            # when no first try was recorded.
            self.misses = max(0, self.misses - 1)
        return entry["title"], entry["text"], summary

    def get_cache_stats(self) -> Tuple[int, int, int]:
        """Return ``(hits, misses, number of cached entries)``."""
        return self.hits, self.misses, len(self.cache)