Chris4K committed
Commit 9b48522
1 Parent(s): 77d7782

Update app.py

Files changed (1)
  1. app.py +47 -3
app.py CHANGED
@@ -54,7 +54,35 @@ model_pipeline = pipeline(
 )
 
 # Use the pipeline in HuggingFacePipeline
-llm = HuggingFacePipeline(pipeline=model_pipeline)
+#llm = HuggingFacePipeline(pipeline=model_pipeline)
+
+##### Alternative
+from transformers import pipeline
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+)
+rmodel = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
+tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
+
+llm = pipeline(
+    model=rmodel,
+    tokenizer=tokenizer,
+    task="text-generation",
+    do_sample=True,
+    temperature=0.2,
+    repetition_penalty=1.1,
+    return_full_text=False,
+    max_new_tokens=500,
+)
+
 
 # NLTK Resource Download
 def download_nltk_resources():
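Note: with this change, `llm` is a raw transformers text-generation pipeline rather than a LangChain `HuggingFacePipeline`. If other parts of app.py still expect the LangChain interface, the quantized pipeline can be wrapped back; a minimal sketch, assuming the `HuggingFacePipeline` import used by the removed line is still present:

    # Sketch: re-wrap the quantized transformers pipeline for LangChain callers.
    hf_pipeline = llm  # the transformers pipeline built above
    llm = HuggingFacePipeline(pipeline=hf_pipeline)

    # Equivalent calls on either interface:
    # hf_pipeline("What is RAG?")[0]["generated_text"]
    # llm.invoke("What is RAG?")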
 
@@ -93,7 +121,7 @@ class ModelManager:
         }
 
 
-    def update_model_ranking(self, model_id: str, score: float, feedback: Optional[str] = None):
+    def update_model_ranking(self, model_id: str, score: float, feedback: str = None):
         """Update model ranking based on performance and optional feedback"""
         current_score = self.rankings.get(model_id, 0.0)
         # Weighted average of current score and new score
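Note: the hunk cuts off before the weighted-average line itself, so the actual weight used in app.py is not visible here. A minimal sketch of the update the comment describes, with a hypothetical weight:

    # Sketch: blend the existing ranking with the new score (ALPHA is an assumption).
    ALPHA = 0.7  # hypothetical weight on the existing score

    def update_ranking(rankings: dict, model_id: str, score: float) -> float:
        current = rankings.get(model_id, 0.0)
        rankings[model_id] = ALPHA * current + (1 - ALPHA) * score
        return rankings[model_id]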
 
@@ -361,6 +389,8 @@ def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separator
         return RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
             chunk_overlap=overlap_size,
+            add_start_index=True,  # If `True`, includes chunk's start index in metadata
+            strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
             separators=custom_separators or ["\n\n", "\n", " ", ""]
         )
     else:
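Note: `add_start_index=True` records where each chunk begins in the source string, which helps with citing sources later. A quick check of the resulting metadata; the chunk sizes here are illustrative:

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=100,
        chunk_overlap=20,
        add_start_index=True,
        strip_whitespace=True,
    )
    docs = splitter.create_documents(["some long source text ..."])
    print(docs[0].metadata)  # e.g. {'start_index': 0}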
 
@@ -369,7 +399,12 @@ def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separator
 def get_embedding_model(model_type, model_name):
     model_path = model_manager.get_model(model_type, model_name)
     if model_type == 'HuggingFace':
-        return HuggingFaceEmbeddings(model_name=model_path)
+        return HuggingFaceEmbeddings(
+            model_name=model_path,
+            multi_process=True,
+            model_kwargs={"device": "cuda"},
+            #encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
+        )
     elif model_type == 'OpenAI':
         return OpenAIEmbeddings(model=model_path)
     elif model_type == 'Cohere':
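Note: `model_kwargs={"device": "cuda"}` assumes a GPU is available, and the commented-out `normalize_embeddings` flag is what makes dot products equal cosine similarity. A minimal sketch with a CPU fallback, assuming `torch` is importable as in the pipeline block above:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"  # avoid crashing on CPU-only hosts
    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,  # same path resolved by model_manager above
        multi_process=True,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},  # unit-length vectors: dot product == cosine
    )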
 
@@ -605,6 +640,15 @@ def visualize_results(results_df, stats_df):
 
     plt.tight_layout()
     return fig
+
+
+#tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
+#lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs_processed)]
+#fig = pd.Series(lengths).hist()
+#plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
+#plt.show()
+
+
 def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
     tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
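Note: the commented-out histogram references `EMBEDDING_MODEL_NAME`, `docs_processed`, `pd`, and `tqdm`, none of which appear in this hunk. A self-contained sketch of the same token-length plot, with those names treated as stand-ins:

    import matplotlib.pyplot as plt
    import pandas as pd
    from transformers import AutoTokenizer

    EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # stand-in model id

    def plot_token_lengths(docs_processed):
        # docs_processed: list of LangChain Documents exposing .page_content
        tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
        lengths = [len(tokenizer.encode(doc.page_content)) for doc in docs_processed]
        pd.Series(lengths).hist()
        plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
        plt.show()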