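"""Utility helpers for text cleaning, chunking, and simple post-processing,
plus an NCBI Entrez lookup that maps an rsID to a gene name."""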
import os
import re
import shutil
import textwrap

import nltk
from Bio import Entrez


def replace_quotes(text):
    """Normalise double quotes inside quote-delimited spans to single quotes."""
    pattern = r'(?<=")[^"]*(?=")'
    return re.sub(pattern, lambda match: match.group(0).replace('"', "'"), text)


def clean_text(text):
    """Remove section titles and figure descriptions from text, then strip punctuation."""
    pattern = r'[^\w\s]'
    rows = [row for row in text.split("\n")
            if len(row.split(" ")) > 3 and not row.startswith("(a)") and not row.startswith("Figure")]
    clean = "\n".join(rows)
    return re.sub(pattern, '', clean)


def truncate_text(text, max_tokens):
    """Return the first wrapped line of `text`, i.e. roughly the first `max_tokens`
    characters truncated at a word boundary (textwrap wraps by character width)."""
    wrapper = textwrap.TextWrapper(width=max_tokens)
    truncated_text = wrapper.wrap(text)
    if len(truncated_text) > 0:
        return truncated_text[0]
    return ""


def split_text(text, chunk_size):
    """Split text into consecutive chunks of `chunk_size` characters."""
    chunks = []
    start = 0
    end = chunk_size
    while start < len(text):
        chunks.append(text[start:end])
        start = end
        end += chunk_size
    return chunks


def extract_gene_name(text):
    """Extract the content of the first <NAME>...</NAME> tag from an Entrez XML reply."""
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    text_str = text.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    pattern = r"<NAME>(.*?)</NAME>"
    match = re.search(pattern, text_str)
    if match:
        return match.group(1)
    return None


def get_geneName(rsid):
    """Fetch the gene name associated with an rsID from the NCBI SNP database.

    NCBI asks that Entrez.email be set before making requests.
    """
    text = Entrez.efetch(db="snp", id=rsid, retmode='xml').read()
    return extract_gene_name(text)


def split_text_into_sentences(text, num_sentences):
    """Group the sentences of `text` into lists of `num_sentences` sentences each.

    Requires the NLTK 'punkt' tokenizer data (nltk.download('punkt')).
    """
    sentences = nltk.sent_tokenize(text)
    grouped_sentences = [sentences[i:i + num_sentences] for i in range(0, len(sentences), num_sentences)]
    return grouped_sentences


def flatten_list(nested_list):
    """Recursively flatten arbitrarily nested lists into a single flat list."""
    flattened_list = []
    for item in nested_list:
        if isinstance(item, list):
            flattened_list.extend(flatten_list(item))
        else:
            flattened_list.append(item)
    return flattened_list


def move_file(source_path, destination_path):
    """Move a file to `destination_path`, creating the destination directory if needed."""
    os.makedirs(destination_path, exist_ok=True)

    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
    except Exception as e:
        print(f"Error: {e}")


def upper_abbreviation(text):
    """Uppercase runs of capitalised words (e.g. dotted abbreviations), dropping any
    periods, after removing the word 'unknown'."""
    pattern1 = r'\b(?:[A-Z][a-z.]*\.?\s*)+\b'
    pattern2 = re.compile(r'unknown', re.IGNORECASE)

    def convert_to_upper(match):
        # Strip periods and uppercase the whole matched span.
        return match.group(0).replace('.', '').upper()

    text = re.sub(pattern2, '', text)
    return re.sub(pattern1, convert_to_upper, text)


def get_valid_year(input_text):
    """Return the first four-digit year found in `input_text` (empty string if none)."""
    # \d{4} restricts matches to four-digit tokens, since the goal is a year.
    years = re.findall(r'\b\d{4}\b', input_text)
    if years:
        return years[0]
    return ''


def sample_size_postproc(text):
    """Drop mixed alphanumeric tokens (e.g. identifiers such as 'rs123') from a sample-size string."""
    words = text.split()
    pattern = r'\b[A-Za-z]+\d+\b'
    cleaned_words = [word for word in words if not re.match(r'.*\d.*[A-Za-z].*$', word)]
    cleaned_text = ' '.join(cleaned_words)
    cleaned_text = re.sub(pattern, '', cleaned_text)
    return cleaned_text
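

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original API). It assumes the
# NLTK 'punkt' data has been downloaded and, for the commented-out Entrez
# call, that Entrez.email is set to a real address; the rsID is an example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = 'The cohort included 1204 participants genotyped for rs429358.\nFigure 1: enrollment flow.'

    print(clean_text(sample))                      # drops the "Figure ..." row and punctuation
    print(truncate_text(sample, max_tokens=40))    # first ~40 characters, word-aligned
    print(split_text(sample, chunk_size=25))       # fixed-size character chunks
    print(flatten_list([[1, 2], [3, [4, 5]]]))     # -> [1, 2, 3, 4, 5]
    print(get_valid_year("Published in 2019 by the consortium"))
    print(sample_size_postproc("1204 cases rs429358 and 980 controls"))
    print(split_text_into_sentences("One. Two. Three. Four.", num_sentences=2))

    # Network-dependent lookup; uncomment after setting a contact address.
    # Entrez.email = "you@example.org"
    # print(get_geneName("429358"))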