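"""Utility helpers for text cleaning, chunking, and simple post-processing,
plus an NCBI Entrez lookup that maps an rsID to a gene name."""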
import os
import re
import shutil
import textwrap

import nltk
from Bio import Entrez


def replace_quotes(text):
    """Normalise double quotes inside quote-delimited spans to single quotes."""
    pattern = r'(?<=")[^"]*(?=")'
    return re.sub(pattern, lambda match: match.group(0).replace('"', "'"), text)


def clean_text(text):
    """Remove section titles and figure descriptions from text, then strip punctuation."""
    pattern = r'[^\w\s]'
    rows = [row for row in text.split("\n")
            if len(row.split(" ")) > 3 and not row.startswith("(a)") and not row.startswith("Figure")]
    clean = "\n".join(rows)
    return re.sub(pattern, '', clean)


def truncate_text(text, max_tokens):
    """Return the first wrapped line of `text`, i.e. roughly the first `max_tokens`
    characters truncated at a word boundary (textwrap wraps by character width)."""
    wrapper = textwrap.TextWrapper(width=max_tokens)
    truncated_text = wrapper.wrap(text)
    if len(truncated_text) > 0:
        return truncated_text[0]
    return ""


def split_text(text, chunk_size):
    """Split text into consecutive chunks of `chunk_size` characters."""
    chunks = []
    start = 0
    end = chunk_size
    while start < len(text):
        chunks.append(text[start:end])
        start = end
        end += chunk_size
    return chunks


def extract_gene_name(text):
    """Extract the content of the first <NAME>...</NAME> tag from an Entrez XML reply."""
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    text_str = text.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    pattern = r"<NAME>(.*?)</NAME>"
    match = re.search(pattern, text_str)
    if match:
        return match.group(1)
    return None


def get_geneName(rsid):
    """Fetch the gene name associated with an rsID from the NCBI SNP database.

    NCBI asks that Entrez.email be set before making requests.
    """
    text = Entrez.efetch(db="snp", id=rsid, retmode='xml').read()
    return extract_gene_name(text)


def split_text_into_sentences(text, num_sentences):
    """Group the sentences of `text` into lists of `num_sentences` sentences each.

    Requires the NLTK 'punkt' tokenizer data (nltk.download('punkt')).
    """
    sentences = nltk.sent_tokenize(text)
    grouped_sentences = [sentences[i:i + num_sentences] for i in range(0, len(sentences), num_sentences)]
    return grouped_sentences


def flatten_list(nested_list):
    """Recursively flatten arbitrarily nested lists into a single flat list."""
    flattened_list = []
    for item in nested_list:
        if isinstance(item, list):
            flattened_list.extend(flatten_list(item))
        else:
            flattened_list.append(item)
    return flattened_list


def move_file(source_path, destination_path):
    """Move a file to `destination_path`, creating the destination directory if needed."""
    os.makedirs(destination_path, exist_ok=True)

    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
    except Exception as e:
        print(f"Error: {e}")


def upper_abbreviation(text):
    """Uppercase runs of capitalised words (e.g. dotted abbreviations), dropping any
    periods, after removing the word 'unknown'."""
    pattern1 = r'\b(?:[A-Z][a-z.]*\.?\s*)+\b'
    pattern2 = re.compile(r'unknown', re.IGNORECASE)

    def convert_to_upper(match):
        # Strip periods and uppercase the whole matched span.
        return match.group(0).replace('.', '').upper()

    text = re.sub(pattern2, '', text)
    return re.sub(pattern1, convert_to_upper, text)


def get_valid_year(input_text):
    """Return the first four-digit year found in `input_text` (empty string if none)."""
    # \d{4} restricts matches to four-digit tokens, since the goal is a year.
    years = re.findall(r'\b\d{4}\b', input_text)
    if years:
        return years[0]
    return ''


def sample_size_postproc(text):
    """Drop mixed alphanumeric tokens (e.g. identifiers such as 'rs123') from a sample-size string."""
    words = text.split()
    pattern = r'\b[A-Za-z]+\d+\b'
    cleaned_words = [word for word in words if not re.match(r'.*\d.*[A-Za-z].*$', word)]
    cleaned_text = ' '.join(cleaned_words)
    cleaned_text = re.sub(pattern, '', cleaned_text)
    return cleaned_text
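

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original API). It assumes the
# NLTK 'punkt' data has been downloaded and, for the commented-out Entrez
# call, that Entrez.email is set to a real address; the rsID is an example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = 'The cohort included 1204 participants genotyped for rs429358.\nFigure 1: enrollment flow.'

    print(clean_text(sample))                      # drops the "Figure ..." row and punctuation
    print(truncate_text(sample, max_tokens=40))    # first ~40 characters, word-aligned
    print(split_text(sample, chunk_size=25))       # fixed-size character chunks
    print(flatten_list([[1, 2], [3, [4, 5]]]))     # -> [1, 2, 3, 4, 5]
    print(get_valid_year("Published in 2019 by the consortium"))
    print(sample_size_postproc("1204 cases rs429358 and 980 controls"))
    print(split_text_into_sentences("One. Two. Three. Four.", num_sentences=2))

    # Network-dependent lookup; uncomment after setting a contact address.
    # Entrez.email = "you@example.org"
    # print(get_geneName("429358"))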