from collections import OrderedDict
from datetime import datetime
from typing import Optional


class CacheHandler:
    def __init__(self, max_cache_size: int = 1000):
        # Using OrderedDict to maintain insertion order for efficient removal of the oldest items
        self.cache = OrderedDict()
        self.cache["https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/"] = {
            "title": "Iker García-Ferrero | Personal Webpage",
            "text": None,  # The default entry has no cached page text
            "date": datetime.now(),
            "summary_0": "Iker García-Ferrero es un candidato a PhD en Natural Language Processing (NLP) "
            "en la Universidad del País Vasco UPV/EHU, IXA Group y HiTZ Centro Vasco de Tecnología de la "
            "Lengua, financiado por una beca del Gobierno Vasco. "
            "En el pasado, ha realizado prácticas en Amazon y ha realizado una estancia "
            "de investigación en la Universidad de Pensilvania (EEUU). "
            "Sus investigaciones se centran en la creación de modelos y recursos para NLP en "
            "lenguas con pocos o ningún recurso disponible, utilizando técnicas de transferencia de "
            "datos y modelos. Recientemente también se ha especializado en el entrenamiento de LLMs.",
            "summary_50": "Iker García-Ferrero es un candidato a PhD en NLP en la Universidad del País Vasco, "
            "con experiencia en Amazon, la Universidad de Pensilvania e HiTZ.",
            "summary_100": "Iker García-Ferrero es un candidato a PhD en NLP.",
        }
        self.max_cache_size = max_cache_size
        self.misses = 0
        self.hits = 0

    def add_to_cache(
        self, url: str, title: str, text: str, summary_type: int, summary: str
    ):
        # If the URL already exists, update it and move it to the end to mark it as the most recently used
        if url in self.cache:
            self.cache.move_to_end(url)
            self.cache[url][f"summary_{summary_type}"] = summary
            self.cache[url]["date"] = datetime.now()
        else:
            # Add a new entry to the cache
            self.cache[url] = {
                "title": title,
                "text": text,
                "date": datetime.now(),
                "summary_0": summary if summary_type == 0 else None,
                "summary_50": summary if summary_type == 50 else None,
                "summary_100": summary if summary_type == 100 else None,
            }

            # Remove the oldest item if the cache exceeds its max size
            if len(self.cache) > self.max_cache_size:
                self.cache.move_to_end(
                    "https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/"
                )  # This is the default entry in the demo, so we don't want to evict it
                self.cache.popitem(last=False)  # Pop the oldest item

    def get_from_cache(
        self, url: str, summary_type: int, second_try: bool = False
    ) -> tuple[Optional[str], Optional[str], Optional[str]]:
        if url in self.cache and self.cache[url][f"summary_{summary_type}"] is not None:
            # Move the accessed item to the end to mark it as recently used
            self.cache.move_to_end(url)
            self.hits += 1
            if second_try:
                # On the first try we didn't get a cache hit, probably because it was a shortened URL.
                # So we decrease the number of misses, because we got the cache hit in the end.
                self.misses -= 1
            return (
                self.cache[url]["title"],
                self.cache[url]["text"],
                self.cache[url][f"summary_{summary_type}"],
            )
        else:
            if not second_try:
                self.misses += 1
            return None, None, None

    def get_cache_stats(self):
        return self.hits, self.misses, len(self.cache)
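

# Minimal usage sketch (illustrative only; the example.com URLs and summary
# strings below are made-up placeholders, not values from the actual demo).
# It shows the LRU behaviour, eviction of the least recently used entry, and
# the hit/miss bookkeeping.
if __name__ == "__main__":
    cache = CacheHandler(max_cache_size=2)

    # First lookup misses, so the caller would fetch and summarize the page,
    # then store the result.
    title, text, summary = cache.get_from_cache("https://example.com/a", 0)
    assert summary is None
    cache.add_to_cache("https://example.com/a", "Page A", "Full text A", 0, "Summary A")

    # Second lookup hits and returns the cached (title, text, summary) triple.
    title, text, summary = cache.get_from_cache("https://example.com/a", 0)
    assert summary == "Summary A"

    # Adding another entry pushes the cache past max_cache_size, evicting the
    # least recently used entry (but never the pinned default URL).
    cache.add_to_cache("https://example.com/b", "Page B", "Full text B", 0, "Summary B")

    hits, misses, size = cache.get_cache_stats()
    print(f"hits={hits} misses={misses} size={size}")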