import random
import sys
from typing import Dict, List

import numpy as np
import supervision as sv
import torch
import torchvision
import torchvision.transforms as T
from huggingface_hub import hf_hub_download
from PIL import Image
from segment_anything import SamPredictor  # segment anything

sys.path.append("tag2text")
sys.path.append("GroundingDINO")

from groundingdino.models import build_model
from groundingdino.util.inference import Model as DinoModel
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
from tag2text.inference import inference as tag2text_inference


def load_model_hf(repo_id, filename, ckpt_config_filename, device="cpu"):
    """Download a GroundingDINO config and checkpoint from the Hugging Face Hub and build the model."""
    cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)

    args = SLConfig.fromfile(cache_config_file)
    args.device = device
    model = build_model(args)

    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
    checkpoint = torch.load(cache_file, map_location=device)
    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    model.eval()
    return model


def download_file_hf(repo_id, filename, cache_dir="./cache"):
    """Download a single file from the Hugging Face Hub into a local cache directory."""
    cache_file = hf_hub_download(
        repo_id=repo_id, filename=filename, force_filename=filename, cache_dir=cache_dir
    )
    return cache_file


def transform_image_tag2text(image_pil: Image.Image) -> torch.Tensor:
    """Resize and normalize a PIL image into the tensor format expected by Tag2Text."""
    transform = T.Compose(
        [
            T.Resize((384, 384)),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image = transform(image_pil)  # 3, h, w
    return image


def show_anns_sam(anns: List[Dict]):
    """Extracts the mask annotations from the Segment Anything model output and composites them
    into a single color image. https://github.com/facebookresearch/segment-anything.

    Arguments:
        anns (List[Dict]): Segment Anything model output.

    Returns:
        (np.ndarray): Masked image.
        (np.ndarray): Annotation encoding from https://github.com/LUSSeg/ImageNet-S.
    """
    if len(anns) == 0:
        return
    sorted_anns = sorted(anns, key=(lambda x: x["area"]), reverse=True)

    full_img = None
    mask_map = None
    # Draw masks from largest to smallest area so smaller objects stay visible on top.
    for i, ann in enumerate(sorted_anns):
        m = ann["segmentation"]
        if full_img is None:
            full_img = np.zeros((m.shape[0], m.shape[1], 3))
            mask_map = np.zeros((m.shape[0], m.shape[1]), dtype=np.uint16)
        mask_map[m != 0] = i + 1
        color_mask = np.random.random((1, 3)).tolist()[0]
        full_img[m != 0] = color_mask
    full_img = full_img * 255

    # anno encoding from https://github.com/LUSSeg/ImageNet-S
    res = np.zeros((mask_map.shape[0], mask_map.shape[1], 3))
    res[:, :, 0] = mask_map % 256
    res[:, :, 1] = mask_map // 256
    res = res.astype(np.float32)
    full_img = np.uint8(full_img)
    return full_img, res
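# Usage sketch (illustrative only; the repo id and filenames below are placeholders, not values
# shipped with this module): `load_model_hf` fetches the SLConfig file and checkpoint from the
# Hugging Face Hub, builds GroundingDINO from the config, and loads the cleaned state dict in
# eval mode.
#
#   dino = load_model_hf(
#       repo_id="<hub-repo-with-groundingdino-weights>",   # placeholder repo id
#       filename="<groundingdino-checkpoint>.pth",         # placeholder checkpoint file
#       ckpt_config_filename="<groundingdino-config>.py",  # placeholder config file
#       device="cuda" if torch.cuda.is_available() else "cpu",
#   )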
def show_anns_sv(detections: sv.Detections):
    """Extracts the mask annotations from a Supervision Detections object and composites them
    into a single color image. https://roboflow.github.io/supervision/detection/core/.

    Arguments:
        detections (sv.Detections): Object containing information about the detections.

    Returns:
        (np.ndarray): Masked image.
        (np.ndarray): Annotation encoding from https://github.com/LUSSeg/ImageNet-S.
    """
    if detections.mask is None:
        return

    full_img = None
    mask_map = None
    # Iterate from largest to smallest area so smaller objects stay visible on top.
    for i in np.flip(np.argsort(detections.area)):
        m = detections.mask[i]
        if full_img is None:
            full_img = np.zeros((m.shape[0], m.shape[1], 3))
            mask_map = np.zeros((m.shape[0], m.shape[1]), dtype=np.uint16)
        mask_map[m != 0] = i + 1
        color_mask = np.random.random((1, 3)).tolist()[0]
        full_img[m != 0] = color_mask
    full_img = full_img * 255

    # anno encoding from https://github.com/LUSSeg/ImageNet-S
    res = np.zeros((mask_map.shape[0], mask_map.shape[1], 3))
    res[:, :, 0] = mask_map % 256
    res[:, :, 1] = mask_map // 256
    res = res.astype(np.float32)
    full_img = np.uint8(full_img)
    return full_img, res


def generate_tags(tag2text_model, image, specified_tags, device="cpu"):
    """Generate image tags and a caption using the Tag2Text model.

    Arguments:
        tag2text_model (nn.Module): Tag2Text model to use for prediction.
        image (Image.Image): The input image in PIL format, as expected by
            transform_image_tag2text.
        specified_tags (str): User-specified tags to condition the prediction on.

    Returns:
        (List[str]): Predicted image tags.
        (str): Predicted image caption.
    """
    image = transform_image_tag2text(image).unsqueeze(0).to(device)
    res = tag2text_inference(image, tag2text_model, specified_tags)
    tags = res[0].split(" | ")
    caption = res[2]
    return tags, caption


def detect(
    grounding_dino_model: DinoModel,
    image: np.ndarray,
    caption: str,
    box_threshold: float = 0.3,
    text_threshold: float = 0.25,
    iou_threshold: float = 0.5,
    post_process: bool = True,
):
    """Detect bounding boxes in the given image, using the input caption.

    Arguments:
        grounding_dino_model (DinoModel): The model to use for detection.
        image (np.ndarray): The image to run detection on. Expects an image in HWC uint8 format,
            with pixel values in [0, 255].
        caption (str): Input caption containing the object names to detect. To detect multiple
            objects, separate the names with '.', e.g. "cat . dog . chair".
        box_threshold (float): Box confidence threshold.
        text_threshold (float): Text confidence threshold.
        iou_threshold (float): IoU threshold used by NMS during post-processing.
        post_process (bool): If True, run NMS to remove duplicate detections.

    Returns:
        (sv.Detections): Object containing information about the detections.
        (List[str]): Predicted phrases.
        (List[str]): Predicted classes.
    """
    detections, phrases = grounding_dino_model.predict_with_caption(
        image=image,
        caption=caption,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
    )
    classes = [name.strip() for name in caption.split(".")]
    detections.class_id = DinoModel.phrases2classes(phrases=phrases, classes=classes)

    # NMS post-processing to drop overlapping duplicate boxes.
    if post_process:
        nms_idx = (
            torchvision.ops.nms(
                torch.from_numpy(detections.xyxy),
                torch.from_numpy(detections.confidence),
                iou_threshold,
            )
            .numpy()
            .tolist()
        )

        phrases = [phrases[idx] for idx in nms_idx]
        detections.xyxy = detections.xyxy[nms_idx]
        detections.confidence = detections.confidence[nms_idx]
        detections.class_id = detections.class_id[nms_idx]

    return detections, phrases, classes
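# Usage sketch (illustrative only; variable names are placeholders): the tags returned by
# `generate_tags` can be joined with " . " to build the caption that `detect` expects, so
# GroundingDINO is prompted with whatever Tag2Text found in the image. `predict_with_caption`
# takes an HWC uint8 array (typically a cv2/BGR-loaded image).
#
#   tags, caption = generate_tags(tag2text_model, image_pil, specified_tags, device=device)
#   detections, phrases, classes = detect(
#       grounding_dino_model,            # a groundingdino.util.inference.Model (DinoModel)
#       image=image_bgr,                 # HWC uint8 array
#       caption=" . ".join(tags),
#       box_threshold=0.3,
#       text_threshold=0.25,
#   )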
def segment(sam_model: SamPredictor, image: np.ndarray, boxes: np.ndarray):
    """Predict masks for the given input boxes, using the provided image.

    Arguments:
        sam_model (SamPredictor): The model to use for mask prediction.
        image (np.ndarray): The image for calculating masks. Expects an image in HWC uint8 format,
            with pixel values in [0, 255].
        boxes (np.ndarray or None): A Bx4 array of box prompts to the model, in XYXY format.

    Returns:
        (np.ndarray): The output masks in BxHxW format, where (H, W) is the original image size.
        (np.ndarray): An array of shape (B,) containing the model's predicted quality score for
            each mask.
    """
    sam_model.set_image(image)

    transformed_boxes = None
    if boxes is not None:
        boxes = torch.from_numpy(boxes)
        transformed_boxes = sam_model.transform.apply_boxes_torch(
            boxes.to(sam_model.device), image.shape[:2]
        )

    masks, scores, _ = sam_model.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=transformed_boxes,
        multimask_output=False,
    )
    # With multimask_output=False there is a single mask per box; drop the mask dimension.
    masks = masks[:, 0, :, :]
    scores = scores[:, 0]
    return masks.cpu().numpy(), scores.cpu().numpy()


def draw_mask(mask, draw, random_color=False):
    """Draw a binary mask onto a PIL ImageDraw canvas as a semi-transparent colored overlay."""
    if random_color:
        color = (
            random.randint(0, 255),
            random.randint(0, 255),
            random.randint(0, 255),
            153,
        )
    else:
        color = (30, 144, 255, 153)

    nonzero_coords = np.transpose(np.nonzero(mask))
    for coord in nonzero_coords:
        # np.nonzero yields (row, col); ImageDraw expects (x, y), so reverse the order.
        draw.point(coord[::-1], fill=color)
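# End-to-end sketch (illustrative only; image variables, model handles, and the alpha-compositing
# step are assumptions, not part of this module): detect boxes from a caption, prompt SAM with
# those boxes, then overlay the masks with `draw_mask` (requires `from PIL import ImageDraw`).
#
#   detections, phrases, classes = detect(grounding_dino_model, image_bgr, caption="cat . dog")
#   masks, scores = segment(sam_predictor, image_rgb, boxes=detections.xyxy)
#   detections.mask = masks.astype(bool)
#
#   overlay = Image.new("RGBA", (image_rgb.shape[1], image_rgb.shape[0]), (0, 0, 0, 0))
#   overlay_draw = ImageDraw.Draw(overlay)
#   for mask in detections.mask:
#       draw_mask(mask, overlay_draw, random_color=True)
#   annotated = Image.alpha_composite(Image.fromarray(image_rgb).convert("RGBA"), overlay)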