File size: 2,553 Bytes
c8f71ad
9189e38
 
 
313433a
9189e38
b1c94e2
 
 
8e06613
 
 
 
 
 
b1c94e2
 
 
 
 
 
 
 
 
 
 
306cc03
b1c94e2
306cc03
b1c94e2
 
 
 
 
 
306cc03
b1c94e2
 
 
 
306cc03
 
 
b1c94e2
 
9189e38
b1c94e2
 
 
ecfb899
b1c94e2
306cc03
b1c94e2
 
 
fb1ac97
 
306cc03
ecfb899
b1c94e2
 
 
 
 
 
 
 
 
 
 
 
 
 
7450395
ecfb899
b1c94e2
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import spacy
import re

# Load the spaCy model
# NOTE: "en_core_web_trf" is the transformer-based English pipeline; it must
# be installed separately (e.g. `python -m spacy download en_core_web_trf`)
# or spacy.load raises OSError at import time of this module.
nlp = spacy.load("en_core_web_trf")

def analyze_text(text):
    """Parse *text* with spaCy and segment it into candidate items.

    Common delimiters — "/", ",", " & ", and the word " and " — are first
    normalized to commas, then the dependency parse is walked to group
    tokens into items (compound nouns and their modifiers stay together).

    Args:
        text: The raw input string to analyze.

    Returns:
        A 3-tuple ``(is_single_noun_phrase, delimiter, items)`` where
        ``is_single_noun_phrase`` is True when exactly one item was found,
        ``delimiter`` is the character chosen by ``determine_delimiter``
        (computed from the ORIGINAL, un-normalized text), and ``items`` is
        the list of whitespace-stripped item strings.
    """
    # Replace different delimiters with a uniform delimiter (comma)
    normalized_text = re.sub(r'[\/,]', ',', text)

    # an ampersand with spaces on both sides is a delimiter
    normalized_text = re.sub(r'\s*&\s*', ',', normalized_text)

    # the word 'and' with spaces on both sides is a delimiter
    normalized_text = re.sub(r'\s+and\s+', ',', normalized_text)

    doc = nlp(normalized_text)

    items = []
    current_item = []

    for token in doc:
        # If the token is punctuation, finalize the current item
        if token.pos_ == 'PUNCT' and token.text == ',':
            if current_item:
                items.append(" ".join(current_item))
                current_item = []
        else:
            # If token is part of a compound noun or an adjective, add to the current item
            if token.dep_ in ('compound', 'amod'):
                current_item.append(token.text)
            elif token.dep_ in ('ROOT', 'appos'):
                if current_item:
                    current_item.append(token.text)
                else:
                    current_item = [token.text]
                # A head that is itself the ROOT closes the current item early
                if token.head.dep_ == 'ROOT':
                    items.append(" ".join(current_item))
                    current_item = []
            else:
                current_item.append(token.text)

    # Add the last item if it exists
    if current_item:
        items.append(" ".join(current_item))

    # Determine if the text is a single noun phrase or multiple items
    is_single_noun_phrase = len(items) == 1

    delimiter = determine_delimiter(text)

    items = [item.strip() for item in items]

    return is_single_noun_phrase, delimiter, items
    
def determine_delimiter(text):
    """Pick the most likely item delimiter used in *text*.

    Preference order: slash (rarer, so a stronger signal when it appears at
    least as often as commas), then comma, then a single space as fallback.

    Args:
        text: The raw input string.

    Returns:
        One of ``'/'``, ``','`` or ``' '``.
    """
    number_of_slashes = text.count('/')
    number_of_commas = text.count(',')

    if number_of_slashes > 0 and number_of_slashes >= number_of_commas:
        # prefer slash over comma, since its rarer
        return '/'
    elif number_of_commas > 0:
        return ','
    else:
        return ' '

def extract_items(text):
    """Split *text* into its component items.

    Runs the spaCy-based analysis to decide whether the text is a single
    noun phrase; if so, the whole text is returned as the only item,
    otherwise the text is split on the detected delimiter.

    Args:
        text: The raw input string.

    Returns:
        A list of item strings (un-stripped, exactly as split).
    """
    single_phrase, delimiter, _ = analyze_text(text)
    return [text] if single_phrase else text.split(delimiter)