vshulev committed on
Commit
cfbe98d
·
1 Parent(s): 5d47dac

Update app

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. app.old.py +183 -0
  3. app.py +22 -171
  4. default_inputs.json +5 -0
  5. requirements.txt +4 -3
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .venv
app.old.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ # import matplotlib.pyplot as plt
4
+ import gradio as gr
5
+ import numpy as np
6
+ import xgboost_infer
7
+
8
+ # def predict_genus_dna(dnaSeqs):
9
+ # genuses = []
10
+
11
+ # # probs = dnamodel.predict_proba(dnaSeqs)
12
+ # # preds = dnamodel.predict(dnaSeqs)
13
+ # # topProb = np.argsort(probs, axis=1)[:,-3:]
14
+ # # topClass = dnamodel.classes_[topProb]
15
+
16
+ # # pred_df = pd.DataFrame(data=[topClass, topProb], columns= ['Genus', 'Probability'])
17
+
18
+ # return genuses
19
+
20
+ # def predict_genus_dna_env(dnaSeqsEnv):
21
+ # genuses = {}
22
+ # probs = model.predict_proba(dnaSeqsEnv)
23
+ # preds = model.predict(dnaSeqsEnv)
24
+
25
+ # for i in range(len(dnaSeqsEnv)):
26
+ # topProb = np.argsort(probs[i], axis=1)[:,-3:]
27
+ # topClass = model.classes_[topProb]
28
+
29
+ # sampleStr = dnaSeqsEnv['nucraw'][i]
30
+ # genuses[sampleStr] = (topClass, topProb)
31
+
32
+ # pred_df = pd.DataFrame(data=[top5class, top5prob], columns= ['Genus', 'Probability'])
33
+
34
+ # return genuses
35
+
36
+ # def get_genus_image(genus):
37
+ # # return a URL to genus image
38
+ # return f"https://example.com/images/{genus}.jpg"
39
+
40
+ def get_genuses(dna_file, dnaenv_file):
41
+ dna_df = pd.read_csv(dna_file.name)
42
+ dnaenv_df = pd.read_csv(dnaenv_file.name)
43
+
44
+ results = []
45
+
46
+ # envdna_genuses = predict_genus_dna_env(dnaenv_df)
47
+ # dna_genuses = predict_genus_dna(dna_df)
48
+ # images = [get_genus_image(genus) for genus in top_5_genuses]
49
+
50
+ genuses = xgboost_infer.infer()
51
+
52
+ results.append({
53
+ "sequence": dna_df['nucraw'],
54
+ # "predictions": pd.concat([dna_genuses, envdna_genuses], axis=0)
55
+ 'predictions': genuses
56
+ })
57
+
58
+ return results
59
+
60
+ def display_results(results):
61
+ display = []
62
+ for result in results:
63
+ # for i in range(len(result["predictions"])):
64
+ # display.append({
65
+ # "DNA Sequence": result["sequence"],
66
+ # "DNA Pred Genus": result['predictions'][i][0],
67
+ # "DNA Only Prob": result['predictions'][i][1],
68
+ # "DNA Env Pred Genus": result['predictions'][i][2],
69
+ # "DNA Env Prob": result['predictions'][i][3],
70
+ # # "Image": result["images"][i]
71
+ # })
72
+ display.append({
73
+ "DNA Sequence": result["sequence"],
74
+ "DNA Pred Genus": result['predictions'][0]
75
+ })
76
+ return pd.DataFrame(display)
77
+
78
+ def gradio_interface(file):
79
+ results = get_genuses(file)
80
+ return display_results(results)
81
+
82
+ # Gradio interface
83
+ with gr.Blocks() as demo:
84
+ with gr.Column():
85
+ gr.Markdown("# DNA Identifier Tool")
86
+ file_input = gr.File(label="Upload DNA CSV file", file_types=['csv'])
87
+ output_table = gr.Dataframe(headers=["DNA", "Coord", "DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
88
+
89
+ def update_output(file):
90
+ result_df = gradio_interface(file)
91
+ return result_df
92
+
93
+ file_input.change(update_output, inputs=file_input, outputs=output_table)
94
+
95
+ demo.launch()
96
+
97
+
98
+ # with gr.Blocks() as demo:
99
+ # with gr.Row():
100
+ # word = gr.Textbox(label="word")
101
+ # leng = gr.Number(label="leng")
102
+ # output = gr.Textbox(label="Output")
103
+ # with gr.Row():
104
+ # run = gr.Button()
105
+
106
+ # event = run.click(predict_genus,
107
+ # [word, leng],
108
+ # output,
109
+ # batch=True,
110
+ # max_batch_size=20)
111
+
112
+ # demo.launch()
113
+
114
+ # DB_USER = os.getenv("DB_USER")
115
+ # DB_PASSWORD = os.getenv("DB_PASSWORD")
116
+ # DB_HOST = os.getenv("DB_HOST")
117
+ # PORT = 8080
118
+ # DB_NAME = "bikeshare"
119
+
120
+ # connection_string = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}?port={PORT}&dbname={DB_NAME}"
121
+
122
+ # def get_count_ride_type():
123
+ # df = pd.read_sql(
124
+ # """
125
+ # SELECT COUNT(ride_id) as n, rideable_type
126
+ # FROM rides
127
+ # GROUP BY rideable_type
128
+ # ORDER BY n DESC
129
+ # """,
130
+ # con=connection_string
131
+ # )
132
+ # fig_m, ax = plt.subplots()
133
+ # ax.bar(x=df['rideable_type'], height=df['n'])
134
+ # ax.set_title("Number of rides by bycycle type")
135
+ # ax.set_ylabel("Number of Rides")
136
+ # ax.set_xlabel("Bicycle Type")
137
+ # return fig_m
138
+
139
+
140
+ # def get_most_popular_stations():
141
+
142
+ # df = pd.read_sql(
143
+ # """
144
+ # SELECT COUNT(ride_id) as n, MAX(start_station_name) as station
145
+ # FROM RIDES
146
+ # WHERE start_station_name is NOT NULL
147
+ # GROUP BY start_station_id
148
+ # ORDER BY n DESC
149
+ # LIMIT 5
150
+ # """,
151
+ # con=connection_string
152
+ # )
153
+ # fig_m, ax = plt.subplots()
154
+ # ax.bar(x=df['station'], height=df['n'])
155
+ # ax.set_title("Most popular stations")
156
+ # ax.set_ylabel("Number of Rides")
157
+ # ax.set_xlabel("Station Name")
158
+ # ax.set_xticklabels(
159
+ # df['station'], rotation=45, ha="right", rotation_mode="anchor"
160
+ # )
161
+ # ax.tick_params(axis="x", labelsize=8)
162
+ # fig_m.tight_layout()
163
+ # return fig_m
164
+
165
+
166
+ # with gr.Blocks() as demo:
167
+ # with gr.Row():
168
+ # bike_type = gr.Plot()
169
+ # station = gr.Plot()
170
+
171
+ # demo.load(get_count_ride_type, inputs=None, outputs=bike_type)
172
+ # demo.load(get_most_popular_stations, inputs=None, outputs=station)
173
+
174
+ # def greet(name, intensity):
175
+ # return "Hello, " + name + "!" * int(intensity)
176
+
177
+ # demo = gr.Interface(
178
+ # fn=greet,
179
+ # inputs=["text", "slider"],
180
+ # outputs=["text"],
181
+ # )
182
+
183
+ demo.launch()
app.py CHANGED
@@ -1,183 +1,34 @@
1
- import os
2
- import pandas as pd
3
- # import matplotlib.pyplot as plt
4
- import gradio as gr
5
- import numpy as np
6
- import xgboost_infer
7
-
8
- # def predict_genus_dna(dnaSeqs):
9
- # genuses = []
10
-
11
- # # probs = dnamodel.predict_proba(dnaSeqs)
12
- # # preds = dnamodel.predict(dnaSeqs)
13
- # # topProb = np.argsort(probs, axis=1)[:,-3:]
14
- # # topClass = dnamodel.classes_[topProb]
15
-
16
- # # pred_df = pd.DataFrame(data=[topClass, topProb], columns= ['Genus', 'Probability'])
17
-
18
- # return genuses
19
-
20
- # def predict_genus_dna_env(dnaSeqsEnv):
21
- # genuses = {}
22
- # probs = model.predict_proba(dnaSeqsEnv)
23
- # preds = model.predict(dnaSeqsEnv)
24
-
25
- # for i in range(len(dnaSeqsEnv)):
26
- # topProb = np.argsort(probs[i], axis=1)[:,-3:]
27
- # topClass = model.classes_[topProb]
28
 
29
- # sampleStr = dnaSeqsEnv['nucraw'][i]
30
- # genuses[sampleStr] = (topClass, topProb)
31
-
32
- # pred_df = pd.DataFrame(data=[top5class, top5prob], columns= ['Genus', 'Probability'])
33
-
34
- # return genuses
35
 
36
- # def get_genus_image(genus):
37
- # # return a URL to genus image
38
- # return f"https://example.com/images/{genus}.jpg"
39
 
40
- def get_genuses(dna_file, dnaenv_file):
41
- dna_df = pd.read_csv(dna_file.name)
42
- dnaenv_df = pd.read_csv(dnaenv_file.name)
43
-
44
- results = []
45
-
46
- # envdna_genuses = predict_genus_dna_env(dnaenv_df)
47
- # dna_genuses = predict_genus_dna(dna_df)
48
- # images = [get_genus_image(genus) for genus in top_5_genuses]
49
 
50
- genuses = xgboost_infer.infer()
51
-
52
- results.append({
53
- "sequence": dna_df['nucraw'],
54
- # "predictions": pd.concat([dna_genuses, envdna_genuses], axis=0)
55
- 'predictions': genuses
56
- })
57
-
58
- return results
59
 
60
- def display_results(results):
61
- display = []
62
- for result in results:
63
- # for i in range(len(result["predictions"])):
64
- # display.append({
65
- # "DNA Sequence": result["sequence"],
66
- # "DNA Pred Genus": result['predictions'][i][0],
67
- # "DNA Only Prob": result['predictions'][i][1],
68
- # "DNA Env Pred Genus": result['predictions'][i][2],
69
- # "DNA Env Prob": result['predictions'][i][3],
70
- # # "Image": result["images"][i]
71
- # })
72
- display.append({
73
- "DNA Sequence": result["sequence"],
74
- "DNA Pred Genus": result['predictions'][0]
75
- })
76
- return pd.DataFrame(display)
77
 
78
- def gradio_interface(file):
79
- results = get_genuses(file)
80
- return display_results(results)
81
 
82
- # Gradio interface
83
  with gr.Blocks() as demo:
84
- with gr.Column():
85
- gr.Markdown("# DNA Identifier Tool")
86
- file_input = gr.File(label="Upload DNA CSV file", file_types=['csv'])
87
- output_table = gr.Dataframe(headers=["DNA", "Coord", "DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
88
-
89
- def update_output(file):
90
- result_df = gradio_interface(file)
91
- return result_df
92
-
93
- file_input.change(update_output, inputs=file_input, outputs=output_table)
94
-
95
- demo.launch()
96
-
97
-
98
- # with gr.Blocks() as demo:
99
- # with gr.Row():
100
- # word = gr.Textbox(label="word")
101
- # leng = gr.Number(label="leng")
102
- # output = gr.Textbox(label="Output")
103
- # with gr.Row():
104
- # run = gr.Button()
105
 
106
- # event = run.click(predict_genus,
107
- # [word, leng],
108
- # output,
109
- # batch=True,
110
- # max_batch_size=20)
 
111
 
112
- # demo.launch()
 
113
 
114
- # DB_USER = os.getenv("DB_USER")
115
- # DB_PASSWORD = os.getenv("DB_PASSWORD")
116
- # DB_HOST = os.getenv("DB_HOST")
117
- # PORT = 8080
118
- # DB_NAME = "bikeshare"
119
 
120
- # connection_string = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}?port={PORT}&dbname={DB_NAME}"
121
-
122
- # def get_count_ride_type():
123
- # df = pd.read_sql(
124
- # """
125
- # SELECT COUNT(ride_id) as n, rideable_type
126
- # FROM rides
127
- # GROUP BY rideable_type
128
- # ORDER BY n DESC
129
- # """,
130
- # con=connection_string
131
- # )
132
- # fig_m, ax = plt.subplots()
133
- # ax.bar(x=df['rideable_type'], height=df['n'])
134
- # ax.set_title("Number of rides by bycycle type")
135
- # ax.set_ylabel("Number of Rides")
136
- # ax.set_xlabel("Bicycle Type")
137
- # return fig_m
138
-
139
-
140
- # def get_most_popular_stations():
141
-
142
- # df = pd.read_sql(
143
- # """
144
- # SELECT COUNT(ride_id) as n, MAX(start_station_name) as station
145
- # FROM RIDES
146
- # WHERE start_station_name is NOT NULL
147
- # GROUP BY start_station_id
148
- # ORDER BY n DESC
149
- # LIMIT 5
150
- # """,
151
- # con=connection_string
152
- # )
153
- # fig_m, ax = plt.subplots()
154
- # ax.bar(x=df['station'], height=df['n'])
155
- # ax.set_title("Most popular stations")
156
- # ax.set_ylabel("Number of Rides")
157
- # ax.set_xlabel("Station Name")
158
- # ax.set_xticklabels(
159
- # df['station'], rotation=45, ha="right", rotation_mode="anchor"
160
- # )
161
- # ax.tick_params(axis="x", labelsize=8)
162
- # fig_m.tight_layout()
163
- # return fig_m
164
-
165
-
166
- # with gr.Blocks() as demo:
167
- # with gr.Row():
168
- # bike_type = gr.Plot()
169
- # station = gr.Plot()
170
-
171
- # demo.load(get_count_ride_type, inputs=None, outputs=bike_type)
172
- # demo.load(get_most_popular_stations, inputs=None, outputs=station)
173
-
174
- # def greet(name, intensity):
175
- # return "Hello, " + name + "!" * int(intensity)
176
-
177
- # demo = gr.Interface(
178
- # fn=greet,
179
- # inputs=["text", "slider"],
180
- # outputs=["text"],
181
- # )
182
-
183
- demo.launch()
 
1
+ import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ import gradio as gr
 
 
 
 
 
4
 
 
 
 
5
 
6
+ with open("default_inputs.json", "r") as default_inputs_file:
7
+ DEFAULT_INPUTS = json.load(default_inputs_file)
 
 
 
 
 
 
 
8
 
 
 
 
 
 
 
 
 
 
9
 
10
+ def set_default_inputs():
11
+ return (DEFAULT_INPUTS["dna_sequence"],
12
+ DEFAULT_INPUTS["latitude"],
13
+ DEFAULT_INPUTS["longitude"])
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
 
 
 
15
 
 
16
  with gr.Blocks() as demo:
17
+ # Header section
18
+ gr.Markdown("# DNA Identifier Tool")
19
+ gr.Markdown("TODO short description of the tool...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ # Collect inputs for app (DNA and location)
22
+ with gr.Row():
23
+ inp_dna = gr.Textbox(label="DNA", placeholder="e.g. AACAATGTA... (will be automatically truncated to 660 characters)")
24
+ with gr.Row():
25
+ inp_lat = gr.Textbox(label="Latitude", placeholder="e.g. -3.009083")
26
+ inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")
27
 
28
+ with gr.Row():
29
+ btn_run = gr.Button("Run")
30
 
31
+ btn_defaults = gr.Button("I'm feeling lucky")
32
+ btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])
 
 
 
33
 
34
+ demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
default_inputs.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "dna_sequence": "AACAATGTATTTGATTTTCGCCCTTGTGAATTTATTCGCTGGCGGAACAATGGCATTGTTGATTCGTTTGGAGTTGTTCCAACCTGGCTTGCAATTTTTAAGACCTGAGTTTTTTAATCAGTTAACAACTATGCACGGCCTTATAATGGTTTTCGGTGCAATTATGCCGGCCTTTGTGGGTTTTGCTAACTTGATGATTCCTTTGCAAATTGGTGCCTCTGATATGGCGTTTGCAAGAATGAACAATTTTAGTTTCTGGATTATGCCTGTTGCAGGGATGTTATTATTTGGCTCATTTTTGGCTCCTGGTGGCGCTACTGCAGCTGGTTGGACTTTGTATGCTCCTTTGTCGGTCCAAATGGGGCCTGGTATGGACATGACTATTTTTGCTGTTCACTTGATGGGTGCTTCATCCATTATGGGATCCATTAATATCATTGTGACAATTCTGAATATGCGTGCTCCTGGACTGTCTTTGATGAAGATGCCAATGTTCTGTTGGACATGGTTGATTACTGCATATTTGTTAATTGCGGTTATGCCTGTTTTAGCTGGTGCTATCACTATGGTTCTAACAGACCGTCACTTTGGAACAAGCTTTTTTGCAGCTGCTGGCGGTGGAGACCCTGTAATGTATCAACATATCTTC",
3
+ "latitude": "-3.009083",
4
+ "longitude": "-58.68281"
5
+ }
requirements.txt CHANGED
@@ -3,6 +3,7 @@ pandas==2.2.2
3
  torch==2.3.0
4
  tqdm==4.66.4
5
  transformers==4.41.2
6
- sklearn
7
- numpy
8
- datasets
 
 
3
  torch==2.3.0
4
  tqdm==4.66.4
5
  transformers==4.41.2
6
+ scikit-learn==1.5.0
7
+ numpy==1.26.4
8
+ datasets==2.19.1
9
+ gradio==4.32.2