Spaces:

mawairon
/

NOOTestspace

Sleeping

App Files Community

mawairon committed on Aug 1, 2024

Commit

b1eabde

•

1 Parent(s): 7ad39e0

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -7

app.py CHANGED Viewed

@@ -81,25 +81,44 @@ def load_model():
 model, tokenizer = load_model()
 def analyze_dna(username, password, sequence):
-    valid_usernames = os.getenv('USERNAME')
     env_password = os.getenv('PASSWORD')
     if username not in valid_usernames or password != env_password:
         return {"error": "Invalid username or password"}, ""
     try:
-        sequence = sequence.replace(" ", "")
         if not all(nucleotide in 'ACTGN' for nucleotide in sequence):
             return {"error": "Sequence contains invalid characters"}, ""
         if len(sequence) < 300:
             return {"error": "Sequence needs to be at least 300 nucleotides long"}, ""
-        inputs = tokenizer(sequence, truncation=True, padding='max_length', max_length=512, return_tensors="pt", return_token_type_ids=False)
-        logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
-        probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
         top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
         top_5_probs = [probabilities[i] for i in top_5_indices]
         top_5_labels = [int_to_label[i] for i in top_5_indices]

 model, tokenizer = load_model()
 def analyze_dna(username, password, sequence):
+    valid_usernames = os.getenv('USERNAME').split(',')
     env_password = os.getenv('PASSWORD')
     if username not in valid_usernames or password != env_password:
         return {"error": "Invalid username or password"}, ""
     try:
+        # Remove all whitespace characters
+        sequence = sequence.replace(" ", "").replace("\n", "").replace("\t", "").replace("\r", "")
         if not all(nucleotide in 'ACTGN' for nucleotide in sequence):
             return {"error": "Sequence contains invalid characters"}, ""
         if len(sequence) < 300:
             return {"error": "Sequence needs to be at least 300 nucleotides long"}, ""
+        def get_logits(seq):
+            inputs = tokenizer(seq, truncation=True, padding='max_length', max_length=512, return_tensors="pt", return_token_type_ids=False)
+            with torch.no_grad():
+                logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+            return logits
+        if len(sequence) > 3000:
+            num_shifts = len(sequence) // 1000
+            logits_sum = None
+            for i in range(num_shifts):
+                shifted_sequence = sequence[i*1000:] + sequence[:i*1000]
+                logits = get_logits(shifted_sequence)
+                if logits_sum is None:
+                    logits_sum = logits
+                else:
+                    logits_sum += logits
+            logits_avg = logits_sum / num_shifts
+        else:
+            logits_avg = get_logits(sequence)
+        probabilities = torch.nn.functional.softmax(logits_avg, dim=-1).squeeze().tolist()
         top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
         top_5_probs = [probabilities[i] for i in top_5_indices]
         top_5_labels = [int_to_label[i] for i in top_5_indices]