Spaces:
Build error
Build error
import os | |
import shutil | |
import textwrap | |
import nltk | |
import re | |
from Bio import Entrez | |
def replace_quotes(text):
    """Run a quote-to-apostrophe substitution over double-quoted spans.

    Every run of non-quote characters that sits directly between two ``"``
    characters is matched, and any ``"`` found inside the matched run is
    replaced with ``'``.

    NOTE(review): the matched run is ``[^"]*`` and the surrounding quotes are
    only lookarounds, so a match can never contain a ``"``.  The replacement
    therefore leaves each match as it was and the returned string is equal
    to ``text``.  Confirm the intended behaviour with callers before
    changing it.
    """
    between_quotes = re.compile(r'(?<=")[^"]*(?=")')

    def _swap_inner_quotes(found):
        return found.group(0).replace('"', "'")

    return between_quotes.sub(_swap_inner_quotes, text)
def clean_text(text):
    """Drop short and caption-like lines from ``text``, then strip punctuation.

    A line is kept only when splitting it on single spaces yields more than
    three pieces and it does not start with ``"(a)"`` or ``"Figure"``.  The
    surviving lines are re-joined with newlines, and every character that is
    neither a word character nor whitespace is removed from the result.
    """
    kept_rows = []
    for row in text.split("\n"):
        # Lines with three or fewer space-separated pieces are discarded.
        if len(row.split(" ")) <= 3:
            continue
        # Sub-figure labels and figure captions are discarded as well.
        if row.startswith(("(a)", "Figure")):
            continue
        kept_rows.append(row)

    joined = "\n".join(kept_rows)
    return re.sub(r'[^\w\s]', '', joined)
def truncate_text(text, max_tokens):
    """Return the first wrapped line of ``text``.

    ``text`` is wrapped with ``textwrap`` using ``max_tokens`` as the wrap
    width, and only the first resulting line is returned.  When wrapping
    produces no lines at all, an empty string is returned.
    """
    wrapped_lines = textwrap.wrap(text, width=max_tokens)
    return wrapped_lines[0] if wrapped_lines else ""
def split_text(text, chunk_size):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        text: The sequence (normally a string) to slice.
        chunk_size: Length of every chunk except possibly the last one.
            Must be greater than zero when ``text`` is non-empty.

    Returns:
        A list of slices of ``text`` in order; the last one may be shorter
        than ``chunk_size``.  Empty ``text`` gives an empty list.

    Raises:
        ValueError: If ``text`` is non-empty and ``chunk_size`` is zero or
            negative.
    """
    # Empty input yields no chunks regardless of chunk_size.
    if not text:
        return []
    # A non-positive step would never advance through the text, so reject it
    # up front instead of looping forever.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
def extract_gene_name(text):
    """Return the content of the first ``<NAME>...</NAME>`` element in ``text``.

    Args:
        text: The document to search.  Either a ``str`` or an object with a
            ``decode`` method (such as ``bytes``), which is decoded as UTF-8.

    Returns:
        The text between the first ``<NAME>`` and the following ``</NAME>``,
        or ``None`` when no such element is found.
    """
    # Accept already-decoded text as well as raw bytes.
    if isinstance(text, str):
        text_str = text
    else:
        text_str = text.decode("utf-8")

    # Remove literal backslash escape sequences (a backslash followed by
    # "n" or "t") and turn an escaped apostrophe into a plain one.
    text_str = (
        text_str.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    )

    match = re.search(r"<NAME>(.*?)</NAME>", text_str)
    return match.group(1) if match else None
def get_geneName(rsid):
    """Fetch the ``snp`` database record for ``rsid`` and extract its gene name.

    The record is requested through ``Entrez.efetch`` in XML mode and the raw
    response is passed to ``extract_gene_name``.

    Args:
        rsid: Identifier passed as ``id`` to ``Entrez.efetch``.

    Returns:
        Whatever ``extract_gene_name`` returns for the fetched payload.
    """
    handle = Entrez.efetch(db="snp", id=rsid, retmode='xml')
    try:
        payload = handle.read()
    finally:
        # Release the underlying connection even if reading fails.
        handle.close()
    return extract_gene_name(payload)
def split_text_into_sentences(text, num_sentences):
    """Tokenize ``text`` into sentences and group them ``num_sentences`` at a time.

    Sentences come from ``nltk.sent_tokenize``.  The result is a list of
    lists, each holding up to ``num_sentences`` consecutive sentences; the
    final group may be shorter.
    """
    sentences = nltk.sent_tokenize(text)
    groups = []
    for offset in range(0, len(sentences), num_sentences):
        groups.append(sentences[offset:offset + num_sentences])
    return groups
def flatten_list(nested_list):
    """Flatten arbitrarily nested lists into a single list, preserving order.

    Only ``list`` instances are descended into; any other element (including
    tuples and other containers) is kept as a single item.
    """
    def _leaves(items):
        # Depth-first walk that yields non-list elements in encounter order.
        for entry in items:
            if isinstance(entry, list):
                yield from _leaves(entry)
            else:
                yield entry

    return list(_leaves(nested_list))
def move_file(source_path, destination_path):
    """Move ``source_path`` to ``destination_path``, reporting the outcome on stdout.

    If nothing exists at ``destination_path`` it is first created as a
    directory (including intermediate directories).  The move itself is
    best-effort: any exception raised while moving is caught and printed
    as an error message instead of being propagated.
    """
    destination_missing = not os.path.exists(destination_path)
    if destination_missing:
        os.makedirs(destination_path)

    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
    except Exception as e:
        # Deliberately non-fatal: the failure is only reported.
        print(f"Error: {e}")
def upper_abbreviation(text):
    """Strip the word "unknown" and upper-case runs of capitalised words.

    First every case-insensitive occurrence of ``unknown`` is removed.  Then
    each run of one or more tokens that start with an upper-case ASCII
    letter (optionally followed by lower-case letters and dots, and
    separated by optional whitespace) has its dots removed and is converted
    to upper case.
    """
    unknown_re = re.compile(r'unknown', re.IGNORECASE)
    capitalised_run_re = re.compile(r'\b(?:[A-Z][a-z.]*\.?\s*)+\b')

    without_unknown = unknown_re.sub('', text)
    return capitalised_run_re.sub(
        lambda found: found.group(0).replace('.', '').upper(),
        without_unknown,
    )
def get_valid_year(input_text):
    """Return the first standalone four-character word in ``input_text``.

    A "word" here is exactly four ``\\w`` characters delimited by word
    boundaries, so letters, digits and underscores all qualify.  An empty
    string is returned when there is no such word.

    NOTE(review): the name suggests a year is expected, but nothing here
    restricts the match to digits - e.g. "year 2021" yields "year".
    Confirm against callers whether that is intended.
    """
    first_hit = re.search(r'\b\w{4}\b', input_text)
    if first_hit is None:
        return ''
    return first_hit.group(0)
def sample_size_postproc(text):
    """Remove mixed letter/digit tokens from ``text``.

    Two passes are applied:

    1. ``text`` is split on whitespace and every token in which a digit is
       followed (anywhere later in the token) by an ASCII letter is dropped;
       the remaining tokens are joined with single spaces.
    2. In the joined string, every standalone run of ASCII letters
       immediately followed by digits is deleted.  The whitespace around a
       deleted run is left in place.
    """
    digit_then_letter = re.compile(r'.*\d.*[A-Za-z].*$')

    kept_tokens = []
    for token in text.split():
        if digit_then_letter.match(token) is None:
            kept_tokens.append(token)

    joined = ' '.join(kept_tokens)
    return re.sub(r'\b[A-Za-z]+\d+\b', '', joined)