ClickbaitFighter / cache_system.py
Iker's picture
Fix error
eb10d48 verified
from collections import OrderedDict
from datetime import datetime
from typing import Optional
class CacheHandler:
    """In-memory LRU cache of article summaries, keyed by URL.

    Each entry is a dict with the keys ``title``, ``text``, ``date`` and one
    ``summary_<type>`` key per summary type (``summary_0``, ``summary_50`` and
    ``summary_100`` are always present and default to ``None``).

    The cache is pre-populated with one demo entry (``DEFAULT_URL``) which is
    never evicted. Hit and miss counters are kept for ``get_cache_stats``.
    """

    # URL of the demo entry seeded in ``__init__``; it is pinned in the cache.
    DEFAULT_URL = "https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/"

    def __init__(self, max_cache_size: int = 1000):
        """Create the cache and seed it with the pinned demo entry.

        Args:
            max_cache_size: Maximum number of entries kept (the pinned demo
                entry counts towards this limit but is never removed).
        """
        # OrderedDict keeps insertion/usage order, so the least recently used
        # entry is always the first one and can be dropped cheaply.
        self.cache = OrderedDict()
        self.cache[self.DEFAULT_URL] = {
            "title": "Iker García-Ferrero | Personal Webpage",
            "date": datetime.now(),
            "text": """I am currently a PhD candidate specializing in Natural Language Processing (NLP) at the University of the Basque Country UPV/EHU, IXA Group, and HiTZ Basque Center for Language Technologies, funded by a grant from the Basque Government. My advisors are German Rigau and Rodrigo Agerri. I anticipate concluding my PhD by early 2024.
My previous experiences include an internship as an Applied Scientist at Amazon Barcelona, where I was part of Lluis Marquez's team. I also served as Visiting Associate for 4 months at the School of Engineering and Applied Science, Department of Computer and Information Science, Cognitive Computation Group at the University of Pennsylvania under the supervision of Dan Roth.
My research primarily focuses on Multilingual Natural Language Processing. I aim to develop deep learning models and resources that enable NLP in languages with limited or no available resources. This research branches in two directions. First, data-transfer methods for which I have developed state-of-the-art techniques to automatically generate annotated data for languages that lack these resources. Second, model-transfer methods, a field in which I've made significant contributions to improve the zero-shot cross-lingual performance of NLP models. Recently, my research has branched into training Large Language Models (LLMs) for various tasks and domains. The most notable ones being GoLLIE a 34B parameter LLM which achieves state-of-the-art results for zero-shot Information Extraction, and MedMT5, the first open-source text-to-text multilingual model for the medical domain.
""",
            "summary_0": "Iker García-Ferrero es un candidato a PhD en Natural Language Processing (NLP) "
            "en la Universidad del País Vasco UPV/EHU, IXA Group y HiTZ Centro Vasco de Tecnología de la "
            "Lengua, financiado por una beca del Gobierno Vasco. "
            "En el pasado, ha realizado prácticas en Amazon y ha realizado una estancia "
            # Trailing space added: the adjacent literals are concatenated and
            # the two sentences were previously glued together.
            "de investigación en la Universidad de Pensilvania (EEUU). "
            "Sus investigaciones se centran en la creación de modelos y recursos para NLP en "
            "lenguas con pocos o ningún recurso disponible, utilizando técnicas de transferencia de "
            "datos y modelos. Recientemente también se ha especializado en el entrenamiento de LLMs",
            "summary_50": "Iker García-Ferrero es un candidato a PhD en NLP en la Universidad del País Vasco, "
            "con experiencia en Amazon, la Universidad de Pensilvania e HiTZ.",
            "summary_100": "Iker García-Ferrero es un candidato a PhD en NLP.",
        }
        self.max_cache_size = max_cache_size
        self.misses = 0
        self.hits = 0

    def _evict_oldest(self) -> None:
        """Drop least recently used entries until the size limit is respected.

        The pinned demo entry is moved to the most-recently-used position
        first, so it is never the one removed; it is kept even when
        ``max_cache_size`` is smaller than 1.
        """
        if len(self.cache) <= self.max_cache_size:
            return
        # This is the default value in the demo, so we don't want to remove it.
        self.cache.move_to_end(self.DEFAULT_URL)
        while len(self.cache) > max(self.max_cache_size, 1):
            self.cache.popitem(last=False)  # pop the oldest item

    def add_to_cache(
        self, url: str, title: str, text: str, summary_type: int, summary: str
    ):
        """Store ``summary`` for ``url`` under the key ``summary_<summary_type>``.

        If ``url`` is already cached, only that summary and the entry's date
        are updated (title and text are left as they were) and the entry is
        marked as most recently used. Otherwise a new entry is created and the
        oldest entries are evicted if the cache grew past ``max_cache_size``.

        Args:
            url: Cache key.
            title: Article title, stored only when the entry is new.
            text: Article text, stored only when the entry is new.
            summary_type: Summary variant identifier (0, 50 and 100 are the
                variants every entry has a slot for).
            summary: Summary text to store.
        """
        summary_key = f"summary_{summary_type}"
        entry = self.cache.get(url)

        if entry is not None:
            # Existing URL: refresh it and mark it as the most recently used.
            self.cache.move_to_end(url)
            entry[summary_key] = summary
            entry["date"] = datetime.now()
            return

        entry = {
            "title": title,
            "text": text,
            "date": datetime.now(),
            "summary_0": None,
            "summary_50": None,
            "summary_100": None,
        }
        # Assigning by key (instead of matching 0/50/100 explicitly) keeps the
        # summary for any summary type, exactly like the update path above.
        entry[summary_key] = summary
        self.cache[url] = entry
        self._evict_oldest()

    def get_from_cache(
        self, url: str, summary_type: int, second_try: bool = False
    ) -> Optional[tuple]:
        """Look up the cached summary of the given type for ``url``.

        Args:
            url: Cache key.
            summary_type: Summary variant identifier.
            second_try: Set when this is a repeated lookup for a request whose
                first lookup missed (e.g. the first URL was a shortened one).
                A hit then cancels the previously counted miss, and a miss is
                not counted a second time.

        Returns:
            Always a 3-tuple: ``(title, text, summary)`` on a hit, or
            ``(None, None, None)`` when the URL is not cached or has no
            summary of the requested type.
        """
        entry = self.cache.get(url)
        # ``.get`` so that an unknown summary type is a miss, not a KeyError.
        summary = None if entry is None else entry.get(f"summary_{summary_type}")

        if summary is None:
            if not second_try:
                self.misses += 1
            return None, None, None

        # Move the accessed item to the end to mark it as recently used.
        self.cache.move_to_end(url)
        self.hits += 1
        if second_try:
            # The first try missed, probably because it was a shortened URL.
            # We got the hit in the end, so take that miss back (never going
            # below zero if no miss had actually been recorded).
            self.misses = max(0, self.misses - 1)
        return entry["title"], entry["text"], summary

    def get_cache_stats(self):
        """Return ``(hits, misses, number_of_cached_entries)``."""
        return self.hits, self.misses, len(self.cache)