# NutriGenMePE / utils.py
# firqaaa's picture
# Upload 6 files
# eb88b82
import os
import shutil
import textwrap
import nltk
import re
from Bio import Entrez
def replace_quotes(text):
    """Swap double quotes for single quotes inside double-quoted spans.

    Each span located between two double quotes is passed through a
    replacement callback that turns any ``"`` it contains into ``'``.

    NOTE(review): the matched span is ``[^"]*`` and therefore can never
    contain a double quote, so as written the input comes back unchanged.
    Confirm what the intended transformation is before relying on this.
    """
    def _swap_quotes(found):
        inner = found.group(0)
        return inner.replace('"', "'")

    return re.sub(r'(?<=")[^"]*(?=")', _swap_quotes, text)
def clean_text(text):
    """Drop short and figure-like rows, then strip punctuation.

    A row (split on ``"\\n"``) is kept only when it has more than three
    space-separated pieces and does not begin with ``"(a)"`` or
    ``"Figure"``. The surviving rows are re-joined with newlines and every
    character that is neither a word character nor whitespace is removed.
    """
    skipped_prefixes = ("(a)", "Figure")
    kept_rows = []
    for line in text.split("\n"):
        # Rows with three or fewer pieces are treated as titles/headings.
        if len(line.split(" ")) <= 3:
            continue
        if line.startswith(skipped_prefixes):
            continue
        kept_rows.append(line)
    return re.sub(r'[^\w\s]', '', "\n".join(kept_rows))
def truncate_text(text, max_tokens):
    """Return the first wrapped line of ``text``.

    The text is wrapped with ``textwrap`` using ``max_tokens`` as the line
    width, so the limit is measured in characters. An empty string is
    returned when wrapping produces no lines.
    """
    wrapped_lines = textwrap.wrap(text, width=max_tokens)
    return wrapped_lines[0] if wrapped_lines else ""
def split_text(text, chunk_size):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        text: The string (or other sliceable sequence) to split.
        chunk_size: Length of every slice except possibly the last one.

    Returns:
        A list of slices in order; an empty list when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A zero or negative
            size can never advance through the text, so it is rejected
            up front instead of looping forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [
        text[offset:offset + chunk_size]
        for offset in range(0, len(text), chunk_size)
    ]
def extract_gene_name(text):
    """Return the content of the first ``<NAME>...</NAME>`` element.

    Args:
        text: The payload to search, either as bytes (decoded as UTF-8)
            or as an already-decoded string.

    Returns:
        The text captured between the first ``<NAME>`` and ``</NAME>``
        tags, or ``None`` when no such element is found.
    """
    # Accept both raw bytes and decoded text so the function does not fail
    # with AttributeError when handed a str.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8")
    # Remove literal backslash escape sequences (a backslash followed by
    # "n" or "t") and unescape \' before searching.
    cleaned = text.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    found = re.search(r"<NAME>(.*?)</NAME>", cleaned)
    return found.group(1) if found else None
def get_geneName(rsid):
    """Fetch the Entrez ``snp`` record for ``rsid`` and extract its gene name.

    Args:
        rsid: Identifier passed as ``id`` to ``Entrez.efetch``.

    Returns:
        Whatever ``extract_gene_name`` returns for the fetched XML payload
        (the first ``<NAME>`` element's content, or ``None``).
    """
    handle = Entrez.efetch(db="snp", id=rsid, retmode='xml')
    try:
        payload = handle.read()
    finally:
        # Always release the underlying connection, even if the read fails.
        handle.close()
    return extract_gene_name(payload)
def split_text_into_sentences(text, num_sentences):
    """Tokenize ``text`` into sentences and batch them.

    Sentences come from ``nltk.sent_tokenize``. They are returned as a list
    of lists, each holding up to ``num_sentences`` consecutive sentences;
    the final batch may be shorter.
    """
    all_sentences = nltk.sent_tokenize(text)
    batches = []
    for offset in range(0, len(all_sentences), num_sentences):
        batches.append(all_sentences[offset:offset + num_sentences])
    return batches
def flatten_list(nested_list):
    """Return a flat list of the items in ``nested_list``.

    Any element that is itself a ``list`` is flattened recursively, in
    order. Elements of other types (including tuples) are kept as they are.
    """
    flat = []
    for element in nested_list:
        flat += flatten_list(element) if isinstance(element, list) else [element]
    return flat
def move_file(source_path, destination_path):
    """Move ``source_path`` into ``destination_path``, creating it if absent.

    When nothing exists at ``destination_path`` it is created as a
    directory (including parents) before the move. Any exception raised
    during the move is caught and reported on stdout rather than raised;
    a success message is printed otherwise.
    """
    destination_missing = not os.path.exists(destination_path)
    if destination_missing:
        os.makedirs(destination_path)

    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
    except Exception as exc:
        # Best-effort by design: report the failure and carry on.
        print(f"Error: {exc}")
def upper_abbreviation(text):
    """Upper-case capitalised word runs and drop the word "unknown".

    First every occurrence of ``unknown`` (case-insensitive) is removed.
    Then each run of words that start with an upper-case ASCII letter
    (optionally containing dots and followed by whitespace) has its dots
    removed and is converted to upper case.
    """
    def _strip_dots_and_upper(found):
        return found.group(0).replace('.', '').upper()

    without_unknown = re.compile(r'unknown', re.IGNORECASE).sub('', text)
    return re.sub(
        r'\b(?:[A-Z][a-z.]*\.?\s*)+\b',
        _strip_dots_and_upper,
        without_unknown,
    )
def get_valid_year(input_text):
    """Return the first four-character word in ``input_text``.

    A "word" here is exactly four word characters delimited by word
    boundaries; an empty string is returned when there is none.

    NOTE(review): the pattern accepts any word characters, not only
    digits, so a token such as "Year" is returned as-is -- confirm that
    callers only pass text where the first such token is the year.
    """
    first_hit = re.search(r'\b\w{4}\b', input_text)
    if first_hit is None:
        return ''
    return first_hit.group(0)
def sample_size_postproc(text):
    """Strip mixed letter/digit tokens from ``text``.

    Two passes are applied:
      1. Whitespace-separated words in which a digit is followed somewhere
         later by an ASCII letter (e.g. ``45kg``) are dropped, and the rest
         are re-joined with single spaces.
      2. Tokens made of letters followed by digits (e.g. ``abc12``) are
         blanked out in place, which can leave consecutive spaces behind.
    """
    kept_words = []
    for token in text.split():
        if re.match(r'.*\d.*[A-Za-z].*$', token) is None:
            kept_words.append(token)
    return re.sub(r'\b[A-Za-z]+\d+\b', '', ' '.join(kept_words))