//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//
package vectorizer

import (
	"context"
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/weaviate/weaviate/entities/models"
	txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models"
)
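
// InspectorClient captures the contextionary operations the Inspector relies
// on: looking up single-word and corpus vectors, finding the nearest neighbors
// of a vector, and checking whether a word is known to the contextionary.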
type InspectorClient interface {
	VectorForWord(ctx context.Context, word string) ([]float32, error)
	VectorForCorpi(ctx context.Context, words []string,
		overrides map[string]string) ([]float32, []txt2vecmodels.InterpretationSource, error)
	NearestWordsByVector(ctx context.Context, vector []float32, n int, k int) ([]string, []float32, error)
	IsWordPresent(ctx context.Context, word string) (bool, error)
}
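
// Inspector answers introspection requests about words, such as whether they
// are present in the contextionary and what their nearest neighbors are.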
type Inspector struct {
	client InspectorClient
}
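
// NewInspector creates an Inspector backed by the given client.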
func NewInspector(client InspectorClient) *Inspector {
	return &Inspector{client: client}
}
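
// GetWords splits the (possibly camelCased) input into individual words and
// returns per-word information plus, if more than one word was found, a
// concatenated representation of the whole input.
//
// Minimal usage sketch, assuming some concrete InspectorClient implementation
// is available as client (not defined in this file):
//
//	insp := NewInspector(client)
//	res, err := insp.GetWords(ctx, "carSuperBowl")
//	if err != nil {
//		// handle error
//	}
//	// res.IndividualWords holds entries for "car", "super" and "bowl";
//	// res.ConcatenatedWord holds the vector and neighbors of the joined corpus.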
func (i *Inspector) GetWords(ctx context.Context, words string) (*models.C11yWordsResponse, error) {
	wordArray, err := i.validateAndSplit(words)
	if err != nil {
		return nil, err
	}

	concatWord, err := i.concatWord(ctx, words, wordArray)
	if err != nil {
		return nil, err
	}

	individualWords, err := i.individualWords(ctx, wordArray)
	if err != nil {
		return nil, err
	}

	return &models.C11yWordsResponse{
		ConcatenatedWord: concatWord,
		IndividualWords:  individualWords,
	}, nil
}
func (i *Inspector) validateAndSplit(words string) ([]string, error) {
	// an empty input would otherwise panic when lowercasing the first rune below
	if len(words) == 0 {
		return nil, fmt.Errorf("invalid word input: input must not be empty")
	}

	// set first character to lowercase
	wordChars := []rune(words)
	wordChars[0] = unicode.ToLower(wordChars[0])
	words = string(wordChars)

	for _, r := range words {
		if !unicode.IsLetter(r) && !unicode.IsNumber(r) {
			return nil, fmt.Errorf("invalid word input: words must only contain unicode letters and digits")
		}
	}

	return split(words), nil
}
func (i *Inspector) concatWord(ctx context.Context, words string,
	wordArray []string,
) (*models.C11yWordsResponseConcatenatedWord, error) {
	if len(wordArray) < 2 {
		// only build a concat response if we have more than a single word
		return nil, nil
	}

	// Join the words into a single corpus. While the contextionary also
	// supports building a centroid from multiple corpi (hence the []string
	// parameter of VectorForCorpi), occurrence-based weighting can only happen
	// within a single corpus. It is therefore preferable in this case to
	// concat the words into one corpus rather than treating each word as its
	// own corpus.
	corpus := strings.Join(wordArray, " ")
	vector, _, err := i.client.VectorForCorpi(ctx, []string{corpus}, nil)
	if err != nil {
		return nil, err
	}

	nearestNeighbors, err := i.nearestNeighbors(ctx, vector)
	if err != nil {
		return nil, err
	}

	return &models.C11yWordsResponseConcatenatedWord{
		ConcatenatedWord:             words,
		SingleWords:                  wordArray,
		ConcatenatedVector:           vector,
		ConcatenatedNearestNeighbors: nearestNeighbors,
	}, nil
}
func (i *Inspector) nearestNeighbors(ctx context.Context,
	vector []float32,
) ([]*models.C11yNearestNeighborsItems0, error) {
	// retrieve the words closest to the given (centroid) vector
	words, dists, err := i.client.NearestWordsByVector(ctx, vector, 12, 32)
	if err != nil {
		return nil, err
	}

	nearestNeighbors := []*models.C11yNearestNeighborsItems0{}

	// build one response item per neighbor, pairing each word with its distance
	for idx, word := range words {
		item := models.C11yNearestNeighborsItems0{
			Word:     word,
			Distance: dists[idx],
		}

		nearestNeighbors = append(nearestNeighbors, &item)
	}

	return nearestNeighbors, nil
}
func (i *Inspector) individualWords(ctx context.Context,
	wordArray []string,
) ([]*models.C11yWordsResponseIndividualWordsItems0, error) {
	var res []*models.C11yWordsResponseIndividualWordsItems0
	for _, word := range wordArray {
		iw, err := i.individualWord(ctx, word)
		if err != nil {
			return nil, fmt.Errorf("word '%s': %v", word, err)
		}

		res = append(res, iw)
	}

	return res, nil
}
func (i *Inspector) individualWord(ctx context.Context,
	word string,
) (*models.C11yWordsResponseIndividualWordsItems0, error) {
	ok, err := i.client.IsWordPresent(ctx, word)
	if err != nil {
		return nil, fmt.Errorf("could not check word presence: %v", err)
	}

	if !ok {
		return i.individualWordNotPresent(word), nil
	}

	return i.individualWordPresent(ctx, word)
}
func (i *Inspector) individualWordNotPresent(word string) *models.C11yWordsResponseIndividualWordsItems0 {
	return &models.C11yWordsResponseIndividualWordsItems0{
		Word:    word,
		Present: false,
	}
}
func (i *Inspector) individualWordPresent(ctx context.Context,
	word string,
) (*models.C11yWordsResponseIndividualWordsItems0, error) {
	info, err := i.individualWordInfo(ctx, word)
	if err != nil {
		return nil, err
	}

	return &models.C11yWordsResponseIndividualWordsItems0{
		Word:    word,
		Present: true,
		Info:    info,
	}, nil
}
func (i *Inspector) individualWordInfo(ctx context.Context,
	word string,
) (*models.C11yWordsResponseIndividualWordsItems0Info, error) {
	vector, err := i.client.VectorForWord(ctx, word)
	if err != nil {
		return nil, err
	}

	nns, err := i.nearestNeighbors(ctx, vector)
	if err != nil {
		return nil, err
	}

	return &models.C11yWordsResponseIndividualWordsItems0Info{
		Vector:           vector,
		NearestNeighbors: nns,
	}, nil
}
// split breaks a CamelCase string into its lower-cased component words.
// Based on: https://github.com/fatih/camelcase
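//
// Illustrative results (not exercised anywhere in this file):
//
//	split("carSuperBowl") // -> []string{"car", "super", "bowl"}
//	split("PDFLoader")    // -> []string{"pdf", "loader"}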
func split(src string) (entries []string) {
	// don't split invalid utf8
	if !utf8.ValidString(src) {
		return []string{src}
	}

	entries = []string{}
	var runes [][]rune
	lastClass := 0
	class := 0

	// split into fields based on class of unicode character
	for _, r := range src {
		switch {
		case unicode.IsLower(r):
			class = 1
		case unicode.IsUpper(r):
			class = 2
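		// digits share a class with lowercase letters, so alphanumeric
		// tokens such as "co2" are kept as a single word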
		case unicode.IsDigit(r):
			class = 1
		default:
			class = 4
		}

		if class == lastClass {
			runes[len(runes)-1] = append(runes[len(runes)-1], r)
		} else {
			runes = append(runes, []rune{r})
		}
		lastClass = class
	}

	// handle upper case -> lower case sequences, e.g.
	// "PDFL", "oader" -> "PDF", "Loader"
	for i := 0; i < len(runes)-1; i++ {
		if unicode.IsUpper(runes[i][0]) && unicode.IsLower(runes[i+1][0]) {
			runes[i+1] = append([]rune{runes[i][len(runes[i])-1]}, runes[i+1]...)
			runes[i] = runes[i][:len(runes[i])-1]
		}
	}

	// construct []string from results
	for _, s := range runes {
		if len(s) > 0 {
			entries = append(entries, strings.ToLower(string(s)))
		}
	}

	return
}