"""Flask app that ranks stored descriptions against a free-text query.

At import time the module reads ``ebd4appdom.xlsx``, parses the textual
``Embeddings`` column into float vectors and loads the
``all-MiniLM-L6-v2`` sentence-transformer.  The single route ``/`` splits a
posted query into word windows, embeds each window, and renders the
highest cosine-similarity rows for every window.
"""

from flask import Flask, jsonify, request, render_template
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
import re

app = Flask(__name__)

# One decimal number, optionally in scientific notation.  The exponent sign
# may be "+" or "-": without "+", a token such as "1.5e+00" would be read as
# the two numbers "1.5" and "00", corrupting the vector.
_FLOAT_PATTERN = re.compile(r'-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?')

# Query chunking and ranking parameters used by the ``index`` view.
CHUNK_MAX_WORDS = 80
CHUNK_OVERLAP_WORDS = 3
MAX_RESULTS_PER_CHUNK = 5


def extract_embeddings(embeddings_str):
    """Parse every number found in *embeddings_str* into a list of floats.

    Args:
        embeddings_str: Text containing decimal numbers (plain or scientific
            notation) separated by arbitrary non-numeric characters.

    Returns:
        The numbers in order of appearance, as ``float`` values.
    """
    return [float(token) for token in _FLOAT_PATTERN.findall(embeddings_str)]


df = pd.read_excel("ebd4appdom.xlsx")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
df['Embeddings'] = df['Embeddings'].apply(extract_embeddings)
descriptions_embeddings = list(df.Embeddings)
patnums = list(df["Number"])
standards = list(df["Standards"])
urls = list(df["URL"])
descriptions = list(df.Description)

# Built once at import so that requests do not re-convert the nested Python
# lists into a tensor for every query chunk.
descriptions_tensor = torch.tensor(descriptions_embeddings, dtype=torch.float32)


def split_string(s, max_len, overlap, min_words_count=0):
    """Split *s* into overlapping windows of whitespace-separated words.

    Windows hold ``max_len`` words and each new window starts ``overlap``
    words before the previous one ended; the final window holds whatever
    words remain.

    Args:
        s: Text to split.
        max_len: Number of words per window.
        overlap: Number of words shared by consecutive windows.
        min_words_count: Windows with this many words or fewer are dropped
            (so the default of 0 drops only empty windows).

    Returns:
        The retained windows, each re-joined with single spaces.

    Raises:
        ValueError: If the text is longer than one window and
            ``overlap >= max_len``; the window start could never advance.
    """
    words = s.split()
    if len(words) > max_len and overlap >= max_len:
        # Previously this combination looped forever while growing a list.
        raise ValueError("overlap must be smaller than max_len")

    substrings = []
    start = 0
    while start + max_len < len(words):
        end = start + max_len
        substrings.append(" ".join(words[start:end]))
        start = end - overlap
    substrings.append(" ".join(words[start:]))

    return [
        chunk for chunk in substrings
        if len(chunk.split()) > min_words_count
    ]


@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the search page; on POST, rank descriptions for the query.

    The posted ``query`` form field is split into word windows.  For each
    window the template receives a list whose first element is
    ``[window_text, 'sample<i>']`` followed by one dict per hit with the
    keys ``score``, ``standards``, ``desc`` and ``url``.  On GET the
    template is rendered with ``results=None``.
    """
    if request.method != 'POST':
        return render_template('index.html', results=None)

    query = request.form['query']
    user_samples = split_string(query, CHUNK_MAX_WORDS, CHUNK_OVERLAP_WORDS)
    top_k = min(MAX_RESULTS_PER_CHUNK, len(descriptions))

    results = []
    for sample_no, user_sample in enumerate(user_samples):
        sample_embedding = embedder.encode(user_sample, convert_to_tensor=True)
        # Keep both operands on one device; a no-op when they already match.
        sample_embedding = sample_embedding.to(descriptions_tensor.device)
        cos_scores = util.cos_sim(sample_embedding, descriptions_tensor)[0]
        top_results = torch.topk(cos_scores, top_k)

        sample_hits = [[user_sample, 'sample' + str(sample_no)]]
        # .tolist() yields plain floats/ints, so list indexing below does not
        # depend on tensor-to-index coercion.
        for score, idx in zip(top_results.values.tolist(),
                              top_results.indices.tolist()):
            sample_hits.append(dict(
                score=round(score, 4),
                standards=standards[idx],
                desc=descriptions[idx],
                url=urls[idx],
            ))
        results.append(sample_hits)

    return render_template('index.html', results=results)


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=7860)