Spaces:

Ki-Seki
/

AutoTab

Sleeping

App Files Community

Ki-Seki committed on Jul 28

Commit

f5cc66a

•

1 Parent(s): 24fa1b6

refactor: optimize the structure

Browse files

Files changed (1) hide show

autotab.py +32 -34

autotab.py CHANGED Viewed

@@ -32,7 +32,14 @@ class AutoTab:
         self.save_every = save_every
         self.api_keys = api_keys
         self.base_url = base_url
         self.request_count = 0
     # ─── IO ───────────────────────────────────────────────────────────────
@@ -67,61 +74,60 @@ class AutoTab:
     # ─── In-Context Learning ──────────────────────────────────────────────
-    def derive_incontext(
-        self, data: pd.DataFrame, input_columns: list[str], output_columns: list[str]
-    ) -> str:
         """Derive the in-context prompt with angle brackets."""
-        n = min(self.max_examples, len(data.dropna(subset=output_columns)))
         in_context = ""
-        for i in range(n):
             in_context += "".join(
-                f"<{col.replace('[Input] ', '')}>{data[col].iloc[i]}</{col.replace('[Input] ', '')}>\n"
-                for col in input_columns
             )
             in_context += "".join(
-                f"<{col.replace('[Output] ', '')}>{data[col].iloc[i]}</{col.replace('[Output] ', '')}>\n"
-                for col in output_columns
             )
             in_context += "\n"
         return in_context
-    def predict_output(
-        self, in_context: str, input_data: pd.DataFrame, input_fields: str
-    ):
         """Predict the output values for the given input data using the API."""
         query = (
             self.instruction
             + "\n\n"
-            + in_context
             + "".join(
                 f"<{col.replace('[Input] ', '')}>{input_data[col]}</{col.replace('[Input] ', '')}>\n"
-                for col in input_fields
             )
         )
         self.query_example = query
         output = self.openai_request(query)
         return output
-    def extract_fields(
-        self, response: str, output_columns: list[str]
-    ) -> dict[str, str]:
         """Extract fields from the response text based on output columns."""
         extracted = {}
-        for col in output_columns:
             field = col.replace("[Output] ", "")
             match = re.search(f"<{field}>(.*?)</{field}>", response)
             extracted[col] = match.group(1) if match else ""
         return extracted
     # ─── Engine ───────────────────────────────────────────────────────────
-    def _predict_and_extract(self, i: int) -> dict[str, str]:
         """Helper function to predict and extract fields for a single row."""
-        prediction = self.predict_output(
-            self.in_context, self.data.iloc[i], self.input_fields
-        )
-        extracted_fields = self.extract_fields(prediction, self.output_fields)
-        return extracted_fields
     def batch_prediction(self, start_index: int, end_index: int):
         """Process a batch of predictions asynchronously."""
@@ -134,16 +140,8 @@ class AutoTab:
                 self.data.at[i, field_name] = extracted_fields.get(field_name, "")
     def run(self):
-        self.data, self.input_fields, self.output_fields = self.load_excel()
-        self.in_context = self.derive_incontext(
-            self.data, self.input_fields, self.output_fields
-        )
-        self.num_data = len(self.data)
-        self.num_examples = len(self.data.dropna(subset=self.output_fields))
-        tqdm_bar = tqdm(total=self.num_data - self.num_examples, leave=False)
-        for start in range(self.num_examples, self.num_data, self.save_every):
             tqdm_bar.update(min(self.save_every, self.num_data - start))
             end = min(start + self.save_every, self.num_data)
             try:

         self.save_every = save_every
         self.api_keys = api_keys
         self.base_url = base_url
         self.request_count = 0
+        self.failed_count = 0
+        self.data, self.input_fields, self.output_fields = self.load_excel()
+        self.in_context = self.derive_incontext()
+        self.num_data = len(self.data)
+        self.num_example = len(self.data.dropna(subset=self.output_fields))
+        self.num_missing = self.num_data - self.num_example
     # ─── IO ───────────────────────────────────────────────────────────────
     # ─── In-Context Learning ──────────────────────────────────────────────
+    def derive_incontext(self) -> str:
         """Derive the in-context prompt with angle brackets."""
+        examples = self.data.dropna(subset=self.output_fields)[: self.max_examples]
         in_context = ""
+        for i in range(len(examples)):
             in_context += "".join(
+                f"<{col.replace('[Input] ', '')}>{self.data[col].iloc[i]}</{col.replace('[Input] ', '')}>\n"
+                for col in self.input_fields
             )
             in_context += "".join(
+                f"<{col.replace('[Output] ', '')}>{self.data[col].iloc[i]}</{col.replace('[Output] ', '')}>\n"
+                for col in self.output_fields
             )
             in_context += "\n"
         return in_context
+    def predict_output(self, input_data: pd.DataFrame):
         """Predict the output values for the given input data using the API."""
         query = (
             self.instruction
             + "\n\n"
+            + self.in_context
             + "".join(
                 f"<{col.replace('[Input] ', '')}>{input_data[col]}</{col.replace('[Input] ', '')}>\n"
+                for col in self.input_fields
             )
         )
         self.query_example = query
         output = self.openai_request(query)
         return output
+    def extract_fields(self, response: str) -> dict[str, str]:
         """Extract fields from the response text based on output columns."""
         extracted = {}
+        for col in self.output_fields:
             field = col.replace("[Output] ", "")
             match = re.search(f"<{field}>(.*?)</{field}>", response)
             extracted[col] = match.group(1) if match else ""
+        if any(extracted[col] == "" for col in self.output_fields):
+            self.failed_count += 1
         return extracted
     # ─── Engine ───────────────────────────────────────────────────────────
+    def _predict_and_extract(self, row: int) -> dict[str, str]:
         """Helper function to predict and extract fields for a single row."""
+        # If any output field is empty, predict the output
+        if any(pd.isnull(self.data.at[row, col]) for col in self.output_fields):
+            prediction = self.predict_output(self.data.iloc[row])
+            extracted_fields = self.extract_fields(prediction)
+            return extracted_fields
+        else:
+            return {col: self.data.at[row, col] for col in self.output_fields}
     def batch_prediction(self, start_index: int, end_index: int):
         """Process a batch of predictions asynchronously."""
                 self.data.at[i, field_name] = extracted_fields.get(field_name, "")
     def run(self):
+        tqdm_bar = tqdm(total=self.num_data, leave=False)
+        for start in range(0, self.num_data, self.save_every):
             tqdm_bar.update(min(self.save_every, self.num_data - start))
             end = min(start + self.save_every, self.num_data)
             try: