search-assistant / llmsearch /site_stats.py
arabellastrange's picture
fixed git
0e9148c
raw
history blame
3.38 kB
import json
import random
import traceback
from llmsearch import utilityV2 as ut
def findnth(haystack, needle, n):
    """Return the index of the n-th occurrence of ``needle`` in ``haystack``.

    Occurrences are counted from 0 and found left to right without overlap
    (the semantics of ``str.split``). Returns -1 when ``haystack`` contains
    fewer than ``n + 1`` occurrences.
    """
    # Splitting at most n + 1 times leaves everything after the n-th
    # occurrence in the final piece.
    pieces = haystack.split(needle, n + 1)
    enough_occurrences = len(pieces) > n + 1
    if not enough_occurrences:
        return -1
    remainder = pieces[-1]
    # The occurrence ends where the remainder begins; step back over it.
    return len(haystack) - len(remainder) - len(needle)
def extract_site(url):
    """Return a short site label for ``url``.

    The label is the second-to-last dot-separated piece of the part of the
    URL that precedes the third "/" (for "https://www.example.com/page" that
    is "example"), with any "https://" / "http://" prefix removed.

    Compared with the earlier implementation this also handles:
      * hosts without a dot ("http://localhost/x" -> "localhost"), which
        previously raised AttributeError because a list reached ``replace``;
      * URLs without a path ("https://example.com" -> "example"), which
        previously produced an empty label.

    Known limitation (unchanged): multi-part public suffixes are not
    recognised, so "https://www.bbc.co.uk/news" yields "co".
    """
    # Everything before the third "/" is scheme + host for a typical
    # "scheme://host/path" URL.
    pieces = url.split("/", 3)
    if len(pieces) > 3:
        origin = "/".join(pieces[:3])
    else:
        # No path component: use the whole URL, minus any query/fragment so
        # that dots inside them are not mistaken for host labels.
        origin = url.split("?", 1)[0].split("#", 1)[0]

    labels = origin.split(".")
    # Dotless hosts have a single label; use it rather than failing.
    site = labels[-2] if len(labels) > 1 else labels[0]
    site = site.replace("https://", "")
    site = site.replace("http://", "")
    return site
# Dictionary of per-site statistics, keyed by site name. Each value is a
# record with "name", "hits", "chars", "get", "extract" and "openai" fields
# (see update_site_stats / retrieve).
site_stats = {}
# Checked by open_site_stats() to skip re-reading site_stats.json.
stats_loaded = False
# Set by update_site_stats(); cleared by ckpt() after a successful write.
stats_dirty = False
def open_site_stats():
    """Load ``site_stats.json`` into the module-level ``site_stats`` once.

    The first call attempts to read the file from the current working
    directory; later calls return immediately. Failures are reported on
    stdout (with a traceback) and leave ``site_stats`` as it was, so callers
    can keep working with whatever is in memory.
    """
    global site_stats, stats_loaded
    if stats_loaded:
        return
    # Record the attempt before reading. Without this the file was re-read on
    # every call, replacing in-memory stats that had not been checkpointed
    # yet (and re-printing the failure each time the file was unreadable).
    stats_loaded = True
    try:
        with open("site_stats.json", "r") as f:
            loaded = json.loads(f.read())
    except Exception:
        # Best-effort load: report and continue, but let KeyboardInterrupt /
        # SystemExit propagate (a bare ``except:`` would swallow them).
        print("Failed to read site_stats.json")
        traceback.print_exc()
        return
    if not isinstance(loaded, dict):
        # The rest of the module indexes site_stats by site name.
        print("Failed to read site_stats.json: expected a JSON object")
        return
    site_stats = loaded
def ckpt():
    """Write ``site_stats`` to ``site_stats.json`` if there are unsaved changes.

    Does nothing when ``stats_dirty`` is false. On success the dirty flag is
    cleared; on failure the error is reported on stdout (with a traceback)
    and the flag stays set so a later call can retry.
    """
    global stats_dirty
    if not stats_dirty:
        return
    try:
        # Serialize before opening the file: mode "w" truncates it, so a
        # serialization error raised after open() would wipe the stats that
        # were saved previously.
        payload = json.dumps(site_stats)
        with open("site_stats.json", "w") as ss:
            ss.write(payload)
        stats_dirty = False
    except Exception as e:
        print(f"Failed to write site_stats: {str(e)}")
        traceback.print_exc()
def update_site_stats(site, char_cnt, get_time, extract_time, openai_time):
    """Add one visit's measurements to the running totals for ``site``.

    Calls open_site_stats() first, creates a zeroed record for a site not
    seen before, increments its hit count, adds ``char_cnt``, ``get_time``,
    ``extract_time`` and ``openai_time`` to the "chars", "get", "extract"
    and "openai" totals, and marks the stats dirty so ckpt() will write them.
    """
    global stats_dirty
    open_site_stats()
    record = site_stats.setdefault(
        site,
        {
            "name": site,
            "hits": 0,
            "chars": 0,
            "get": 0,
            "extract": 0,
            "openai": 0,
        },
    )
    # Records that lack a "hits" field are treated as having zero hits.
    record["hits"] = record.get("hits", 0) + 1
    increments = (
        ("chars", char_cnt),
        ("get", get_time),
        ("extract", extract_time),
        ("openai", openai_time),
    )
    for field, amount in increments:
        record[field] = amount + record[field]
    stats_dirty = True
def retrieve(site):
    """Return the stats record for ``site``, creating a zeroed one if absent.

    A newly created record is stored in ``site_stats``; the dirty flag is
    not touched.
    """
    try:
        return site_stats[site]
    except KeyError:
        fresh = {
            "name": site,
            "hits": 0,
            "chars": 0,
            "get": 0,
            "extract": 0,
            "openai": 0,
        }
        site_stats[site] = fresh
        return fresh
def get_next(urls, sample_unknown=False):
    """Pick one ``(stats_record, url)`` pair from ``urls``.

    A URL is eligible when ``sample_unknown`` is true, or when its site is
    listed in ``ut.sites`` with a non-zero value. Returns ``[]`` when nothing
    is eligible and the sole candidate when there is exactly one. Otherwise,
    roughly 15% of the time a uniformly random candidate is returned; the
    rest of the time candidates are ranked by characters obtained per unit
    of total time and a draw biased toward the top of that ranking is made.
    """

    def yield_rate(entry):
        # chars per unit of (get + extract + openai) time, scaled by 1e6;
        # the time total is floored at 1000.
        stats = entry[0]
        elapsed = stats["get"] + stats["extract"] + stats["openai"]
        return (stats["chars"] * 1000000) / (max(1000, elapsed))

    pool = []
    for url in urls:
        site = extract_site(url)
        # retrieve() runs for every URL, eligible or not, so each site gets
        # a stats record.
        stats = retrieve(site)
        if not sample_unknown and not (site in ut.sites and ut.sites[site] != 0):
            continue
        pool.append((stats, url))

    if not pool:
        return []
    if len(pool) == 1:
        return pool[0]

    # Occasionally pick uniformly at random, without ranking.
    if random.random() > 0.85:
        return pool[int(random.random() * len(pool))]

    # Otherwise rank best-first and pick near the top: cubing a draw from
    # [0, 1) pushes it toward 0, i.e. toward the highest-ranked candidates.
    pool.sort(key=yield_rate, reverse=True)
    draw = random.random()
    return pool[int(draw * draw * draw * len(pool))]