import os
import re
import shutil
import textwrap
from typing import Any, List, Optional, Union

# The third-party packages are needed only by split_text_into_sentences (NLTK)
# and get_geneName (Biopython). Guarding the imports keeps the pure-text
# helpers below importable on machines where they are not installed; the two
# functions that need them raise a clear ImportError instead.
try:
    import nltk
except ImportError:
    nltk = None

try:
    from Bio import Entrez
except ImportError:
    Entrez = None


def replace_quotes(text: str) -> str:
    """Run the double-quote substitution over *text* and return the result.

    NOTE(review): the pattern only captures runs of non-quote characters that
    sit between two double quotes, so the per-match ``replace`` never finds a
    double quote to convert and the result always equals the input. The
    behaviour is kept as-is; confirm the intended transformation.
    """
    pattern = r'(?<=")[^"]*(?=")'
    return re.sub(pattern, lambda match: match.group(0).replace('"', "'"), text)


def clean_text(text: str) -> str:
    """Remove section titles and figure descriptions from text.

    A line is kept only when splitting it on single spaces yields more than
    three pieces and it does not start with "(a)" or "Figure". Every character
    that is neither a word character nor whitespace is then stripped from the
    kept lines.
    """
    kept_rows = [
        row
        for row in text.split("\n")
        if len(row.split(" ")) > 3 and not row.startswith(("(a)", "Figure"))
    ]
    return re.sub(r'[^\w\s]', '', "\n".join(kept_rows))


def truncate_text(text: str, max_tokens: int) -> str:
    """Return the first line produced by wrapping *text*, or "" if none.

    Despite the parameter name, ``max_tokens`` is handed to ``textwrap`` as
    the line width, i.e. it is measured in characters, not tokens.
    """
    wrapped_lines = textwrap.TextWrapper(width=max_tokens).wrap(text)
    return wrapped_lines[0] if wrapped_lines else ""


def split_text(text: str, chunk_size: int) -> List[str]:
    """Split *text* into consecutive slices of at most *chunk_size* characters.

    The last slice may be shorter. An empty *text* gives an empty list.

    Raises:
        ValueError: if *chunk_size* is not positive (a non-positive size can
            never advance through the text).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]


def extract_gene_name(text: Union[bytes, str]) -> Optional[str]:
    """Extract the gene name from an Entrez response body.

    *text* may be UTF-8 encoded bytes or an already decoded string. Literal
    backslash escape sequences (backslash followed by ``n``, ``t`` or ``'``)
    are removed or unescaped before matching.

    NOTE(review): the pattern below has no delimiters around its group, so it
    always matches the empty string at position 0 and this function currently
    always returns "". It looks as if surrounding markup was lost from the
    pattern; it is left untouched until the intended delimiters are confirmed.
    """
    if isinstance(text, (bytes, bytearray)):
        text_str = text.decode("utf-8")
    else:
        text_str = text
    text_str = text_str.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    pattern = r"(.*?)"
    match = re.search(pattern, text_str)
    if match:
        return match.group(1)
    return None


def get_geneName(rsid):
    """Fetch the SNP record for *rsid* from Entrez and extract its gene name.

    Raises:
        ImportError: if Biopython is not installed.
    """
    if Entrez is None:
        raise ImportError("Biopython (Bio.Entrez) is required for get_geneName")
    handle = Entrez.efetch(db="snp", id=rsid, retmode='xml')
    try:
        raw = handle.read()
    finally:
        # Always release the network handle, even if reading fails.
        handle.close()
    return extract_gene_name(raw)


def split_text_into_sentences(text: str, num_sentences: int) -> List[List[str]]:
    """Tokenize *text* into sentences and group them *num_sentences* at a time.

    The final group may contain fewer sentences.

    Raises:
        ImportError: if NLTK is not installed.
    """
    if nltk is None:
        raise ImportError("nltk is required for split_text_into_sentences")
    sentences = nltk.sent_tokenize(text)
    return [
        sentences[start:start + num_sentences]
        for start in range(0, len(sentences), num_sentences)
    ]


def flatten_list(nested_list: List[Any]) -> List[Any]:
    """Return a flat list with the items of arbitrarily nested lists.

    Only ``list`` instances are descended into; tuples and other iterables
    are kept as single items.
    """
    flattened: List[Any] = []
    for item in nested_list:
        if isinstance(item, list):
            flattened.extend(flatten_list(item))
        else:
            flattened.append(item)
    return flattened


def move_file(source_path: str, destination_path: str) -> None:
    """Move *source_path* to *destination_path*, creating the latter if absent.

    When *destination_path* does not exist it is created as a directory
    first. The outcome of the move is reported on stdout; failures of the
    move itself are printed rather than raised (best-effort behaviour).
    """
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)

    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
    except Exception as e:
        print(f"Error: {e}")


def upper_abbreviation(text: str) -> str:
    """Drop "unknown" (any case) and upper-case capitalised word runs.

    Every run of words that start with an ASCII capital letter is converted
    to upper case with its dots removed.
    """
    pattern1 = r'\b(?:[A-Z][a-z.]*\.?\s*)+\b'
    pattern2 = re.compile(r'unknown', re.IGNORECASE)

    def convert_to_upper(match):
        return match.group(0).replace('.', '').upper()

    without_unknown = pattern2.sub('', text)
    return re.sub(pattern1, convert_to_upper, without_unknown)


def get_valid_year(input_text: str) -> str:
    """Return the first standalone four-character word of *input_text*.

    The match is on any four word characters (letters, digits, underscore),
    not only digits. Returns "" when there is no such word.
    """
    first_match = re.search(r'\b\w{4}\b', input_text)
    return first_match.group(0) if first_match else ''


def sample_size_postproc(text: str) -> str:
    """Strip unit-like and label-like tokens from a sample-size string.

    Whitespace-separated words in which a digit is followed, anywhere later,
    by a letter (e.g. "5mg") are dropped. Afterwards, letters-then-digits
    tokens (e.g. "n200") are deleted from the re-joined text; the spaces
    around such a token are left in place.
    """
    pattern = r'\b[A-Za-z]+\d+\b'
    kept_words = [
        word
        for word in text.split()
        if not re.match(r'.*\d.*[A-Za-z].*$', word)
    ]
    return re.sub(pattern, '', ' '.join(kept_words))