import pandas as pd
import numpy as np


def analyze_and_fill_usage(
    usage_data: pd.DataFrame,
    gap_threshold: int = 62,
    fill_earliest_cutoff: str = "2013-01-01",
    min_fill_gap_months: int = 9,
    rolling_window_size: int = 4,
    rolling_centered: bool = True,
    sequence_fill_method: str = "mean",
    post_missing_threshold: float = 0.1,
) -> pd.DataFrame:
    """
    Analyze the missing-data pattern of usage_data and compute FillStartDate and NotGonnaUse.
    Returns summary_df with statistics only; no time-series filling is performed here.
    """
    df = usage_data.copy()
    df["StartDate"] = pd.to_datetime(df["StartDate"])
    df["EndDate"] = pd.to_datetime(df["EndDate"]) - pd.Timedelta(days=2)
    # Detect the building-name column automatically so both naming formats are supported.
    building_col = None
    if "BuildingName" in df.columns:
        building_col = "BuildingName"
    elif "Building Name" in df.columns:
        building_col = "Building Name"
        # Create a standardized column name for downstream use.
        df["BuildingName"] = df["Building Name"]
    else:
        raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")
    records = []
    # Always group by the standardized BuildingName column for consistency.
    for (bld, util), grp in df.groupby(["BuildingName", "CommodityCode"]):
        # Build the complete monthly index for this building/commodity pair.
        start = grp["StartDate"].min().replace(day=1)
        end = grp["StartDate"].max().replace(day=1)
        full_idx = pd.date_range(start, end, freq="MS")
        flag = pd.Series(0, index=full_idx)
        flag.loc[grp["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1
        missing = flag[flag == 0].index

        # Classify missing months as Random (isolated) or Sequence (consecutive runs).
        seq_ranges, rand_dates = [], []
        rand_months = seq_months = 0
        if missing.empty:
            mtype = "No Missing"
        else:
            gaps = missing.to_series().diff().dt.days.fillna(9999)
            gid = (gaps > gap_threshold).cumsum()
            for _, seg in missing.to_series().groupby(gid):
                if len(seg) > 1:
                    seq_ranges.append(
                        f"From {seg.min().strftime('%Y-%m')} to {seg.max().strftime('%Y-%m')}"
                    )
                    seq_months += len(seg)
                else:
                    rand_dates.append(seg.iloc[0].strftime("%Y-%m"))
                    rand_months += 1
            has_rand, has_seq = bool(rand_dates), bool(seq_ranges)
            if has_rand and has_seq:
                mtype = "Both"
            elif has_rand:
                mtype = "Random"
            else:
                mtype = "Sequence"

        records.append(
            {
                "BuildingName": bld,
                "CommodityCode": util,
                "MissingType": mtype,
                "SequenceMissingRanges": "; ".join(seq_ranges),
                "RandomMissingDates": "; ".join(rand_dates),
                "TotalMonths": len(full_idx),
                "RandomMissingMonths": rand_months,
                "SequenceMissingMonths": seq_months,
                "RandomMissingRatio": rand_months / len(full_idx) if full_idx.size else 0,
                "SequenceMissingRatio": seq_months / len(full_idx) if full_idx.size else 0,
            }
        )
    summary_df = pd.DataFrame(records)

    # -------------------------------------------------------------
    # Compute FillStartDate
    # -------------------------------------------------------------
    cutoff_dt = pd.to_datetime(fill_earliest_cutoff)

    def get_fill_start(r):
        """
        Compute FillStartDate using the clarified strategy:
        1. Regardless of sequence gaps before the cutoff, check whether any sequence gap of at
           least min_fill_gap_months months starts on or after the cutoff (default 2013-01-01).
        2. If such a gap exists, return the month after that gap ends.
        3. Otherwise, return the cutoff date itself.
        """
        try:
            # Parse every recorded sequence-missing range.
            seq_ranges = []
            seq_missing_ranges = r.get("SequenceMissingRanges", "")
            if not seq_missing_ranges or pd.isna(seq_missing_ranges):
                # No sequence gaps recorded; start filling at the cutoff.
                return cutoff_dt
            for rng in str(seq_missing_ranges).split("; "):
                if "to" not in rng:
                    continue
                try:
                    s, e = rng.replace("From ", "").split(" to ")
                    sd, ed = pd.to_datetime(s), pd.to_datetime(e)
                    gap = (ed.to_period("M") - sd.to_period("M")).n + 1
                    seq_ranges.append((sd, ed, gap))
                except Exception:
                    # Skip ranges that cannot be parsed and keep processing the rest.
                    continue
            if not seq_ranges:
                # No valid sequence gaps; start filling at the cutoff.
                return cutoff_dt
            # Only consider gaps that start on or after the cutoff and are long enough.
            post_cutoff_gaps = [
                (sd, ed, gap) for sd, ed, gap in seq_ranges
                if sd >= cutoff_dt and gap >= min_fill_gap_months
            ]
            if post_cutoff_gaps:
                # Sort by start date and use the first qualifying gap.
                post_cutoff_gaps.sort(key=lambda x: x[0])
                sd, ed, gap = post_cutoff_gaps[0]
                return ed + pd.offsets.MonthBegin(1)
            # No qualifying gap after the cutoff; start filling at the cutoff.
            return cutoff_dt
        except Exception as e:
            # If anything fails, fall back to the cutoff so apply() never propagates an
            # exception or silently produces NaT; log the traceback for debugging.
            import traceback
            print(f"get_fill_start exception for {r.get('BuildingName', 'Unknown')}: {e}")
            print(f"Traceback: {traceback.format_exc()}")
            return cutoff_dt
| summary_df["FillStartDate"] = summary_df.apply(get_fill_start, axis=1) | |
| # ------------------------------------------------------------- | |
| # 计算填充后缺失比率 | |
| # ------------------------------------------------------------- | |
| post_recs = [] | |
| for _, r in summary_df.iterrows(): | |
| bld, util, fsd = r["BuildingName"], r["CommodityCode"], r["FillStartDate"] | |
| if pd.isna(fsd): | |
| continue | |
| grp2 = df[ | |
| (df["BuildingName"] == bld) | |
| & (df["CommodityCode"] == util) | |
| & (df["StartDate"] >= fsd) | |
| ] | |
| if grp2.empty: | |
| continue | |
| idx2 = pd.date_range( | |
| fsd.replace(day=1), grp2["StartDate"].max().replace(day=1), freq="MS" | |
| ) | |
| flag2 = pd.Series(0, index=idx2) | |
| flag2.loc[grp2["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1 | |
| miss2 = flag2[flag2 == 0].index | |
| gaps2 = pd.Series(miss2).diff().dt.days.fillna(9999) | |
| gid2 = (gaps2 > gap_threshold).cumsum() | |
| rm2 = sm2 = 0 | |
| for _, seg2 in pd.Series(miss2).groupby(gid2): | |
| if len(seg2) > 1: | |
| sm2 += len(seg2) | |
| else: | |
| rm2 += 1 | |
| post_recs.append( | |
| { | |
| "BuildingName": bld, | |
| "CommodityCode": util, | |
| "PostTotalMonths": len(idx2), | |
| "PostRandomMissingMonths": rm2, | |
| "PostSequenceMissingMonths": sm2, | |
| "PostRandomMissingRatio": rm2 / len(idx2) if idx2.size else 0, | |
| "PostSequenceMissingRatio": sm2 / len(idx2) if idx2.size else 0, | |
| } | |
| ) | |
| post_df = pd.DataFrame(post_recs) | |
| # 🔧 修复:如果post_df为空,需要创建包含所有必要列的空DataFrame | |
| if post_df.empty: | |
| # 创建一个与summary_df结构匹配的空DataFrame | |
| post_df = pd.DataFrame(columns=[ | |
| "BuildingName", "CommodityCode", "PostTotalMonths", | |
| "PostRandomMissingMonths", "PostSequenceMissingMonths", | |
| "PostRandomMissingRatio", "PostSequenceMissingRatio" | |
| ]) | |
| summary_df = summary_df.merge(post_df, on=["BuildingName", "CommodityCode"], how="left") | |
| # 🔧 修复:填充缺失的post分析列为默认值 | |
| post_columns = ["PostTotalMonths", "PostRandomMissingMonths", "PostSequenceMissingMonths", | |
| "PostRandomMissingRatio", "PostSequenceMissingRatio"] | |
| for col in post_columns: | |
| if col not in summary_df.columns: | |
| if "Ratio" in col: | |
| summary_df[col] = 0.0 # 比率列默认为0 | |
| else: | |
| summary_df[col] = 0 # 月数列默认为0 | |
| summary_df["NotGonnaUse"] = ( | |
| (summary_df["PostRandomMissingRatio"] > post_missing_threshold) | |
| | (summary_df["PostSequenceMissingRatio"] > post_missing_threshold) | |
| ).astype(int) | |
| return summary_df | |
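

# A minimal usage sketch for analyze_and_fill_usage (illustrative only: `usage_df` below is a
# hypothetical frame, but the column names match what the function expects):
#
#     usage_df = pd.DataFrame({
#         "Building Name": ["Hall A"] * 4,
#         "CommodityCode": ["ELEC"] * 4,
#         "StartDate": ["2013-01-05", "2013-02-05", "2013-04-05", "2013-05-05"],
#         "EndDate": ["2013-02-04", "2013-03-04", "2013-05-04", "2013-06-04"],
#         "Use": [100, 110, 95, 105],
#     })
#     summary = analyze_and_fill_usage(usage_df)
#     print(summary[["BuildingName", "MissingType", "FillStartDate", "NotGonnaUse"]])
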
def fill_usage_with_sequence_check_strict_mean(
    usage_data: pd.DataFrame,
    summary_df: pd.DataFrame,
    method: str = "mean",
    force: bool = False,
    fill_earliest_cutoff: str = "1900-01-01",
) -> pd.DataFrame:
    """
    Fill usage_data according to the FillStartDate / NotGonnaUse flags in summary_df.

    Parameters
    ----------
    usage_data : original usage table
    summary_df : output of analyze_and_fill_usage
    method : 'mean' or 'median'
    force : when True, ignore NotGonnaUse and NaT and adjust the start date so a series is always produced
    fill_earliest_cutoff : fallback start date when FillStartDate is NaT (only used when force=True)

    Returns
    -------
    filled_df : ['BuildingName', 'CommodityCode', 'Date', 'FilledUse']
    """
    df = usage_data.copy()
    df["StartDate"] = pd.to_datetime(df["StartDate"]).dt.to_period("M").dt.to_timestamp()

    # Detect the building-name column automatically so both naming formats are supported.
    building_col = None
    if "BuildingName" in df.columns:
        building_col = "BuildingName"
    elif "Building Name" in df.columns:
        building_col = "Building Name"
        # Create a standardized column name for downstream use.
        df["BuildingName"] = df["Building Name"]
    else:
        raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")

    all_records = []
    for _, row in summary_df.iterrows():
        bld, util, fsd, drop = (
            row["BuildingName"],
            row["CommodityCode"],
            row["FillStartDate"],
            row["NotGonnaUse"],
        )
        # ───────── Gates 1 & 2 ─────────
        if not force and (drop == 1 or pd.isna(fsd)):
            # Strict mode: skip series with too much missing data or no valid start date.
            continue
        if force and pd.isna(fsd):
            fsd = pd.to_datetime(fill_earliest_cutoff)
        # Take the raw rows on or after fsd (using the standardized BuildingName column).
        grp = df[
            (df["BuildingName"] == bld)
            & (df["CommodityCode"] == util)
            & (df["StartDate"] >= fsd)
        ]
        # ───────── Gate 3 ─────────
        if grp.empty:
            if not force:
                continue
            # Force mode: fall back to the earliest month available for this combination.
            grp = df[(df["BuildingName"] == bld) & (df["CommodityCode"] == util)]
            if grp.empty:
                # No data exists at all for this combination.
                continue
            fsd = grp["StartDate"].min()
        last_m = grp["StartDate"].max()
        all_months = pd.date_range(fsd, last_m, freq="MS")
        monthly = grp.groupby("StartDate")["Use"].sum().reindex(all_months)
        base = monthly.dropna()
        fill_val = base.median() if method == "median" else base.mean()
        filled = monthly.fillna(fill_val).reset_index()
        filled.columns = ["Date", "FilledUse"]
        filled["BuildingName"] = bld
        filled["CommodityCode"] = util
        all_records.append(filled)

    if not all_records:
        return pd.DataFrame(columns=["BuildingName", "CommodityCode", "Date", "FilledUse"])
    return pd.concat(all_records, ignore_index=True)
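

# A minimal sketch chaining the two functions (illustrative; `usage_df` is the same hypothetical
# frame shown after analyze_and_fill_usage above):
#
#     summary = analyze_and_fill_usage(usage_df)
#     filled = fill_usage_with_sequence_check_strict_mean(usage_df, summary, method="mean")
#     # `filled` holds one row per month with columns Date / FilledUse / BuildingName / CommodityCode,
#     # with gaps replaced by the per-series mean (or median when method="median").
#     print(filled.head())
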
# ===============================================================
# LLM-based Weather Variable Selection Functions
# ===============================================================

# Building Type → Weather Variable Rule Mapping
weather_influence_map = {
    "Office": ["temp_mean", "temp_std", "CDD_sum", "clouds_all_mean"],
    "Instructional": ["temp_mean", "temp_std", "CDD_sum", "humidity_mean"],
    "Residential": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
    "Health": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
    "Library": ["temp_mean", "temp_std", "humidity_mean", "clouds_all_mean"],
    "Dining": ["temp_mean", "rain_sum", "CDD_sum"],
    "Recreation": ["temp_mean", "rain_sum", "wind_speed_mean"],
    "Assembly or Theater": ["temp_mean", "wind_speed_mean", "rain_sum"],
    "Affiliate": ["temp_mean", "CDD_sum", "rain_sum"],
    "Parking Structure": [],
    "Infrastructure": [],
    "Container": [],
    "Mixed": ["temp_mean", "CDD_sum", "humidity_mean"],
    "Other": ["temp_mean", "CDD_sum"],
}
def infer_building_type(text: str) -> str:
    """Infer the building type from a free-text description."""
    patterns = {
        "Instructional": ["Teaching", "Classroom", "School", "Lecture Hall", "Academic", "Education"],
        "Residential": ["Residential", "Apartment", "Dormitory", "Housing", "Student"],
        "Office": ["Office", "Office Building", "Administrative", "Admin"],
        "Health": ["Hospital", "Medical", "Clinic", "Health"],
        "Dining": ["Canteen", "Restaurant", "Dining", "Food", "Kitchen"],
        "Recreation": ["Fitness", "Sports", "Entertainment", "Recreation", "Gym"],
        "Library": ["Library"],
        "Assembly or Theater": ["Theater", "Auditorium", "Performance", "Assembly"],
        "Affiliate": ["Affiliate"],
    }
    text_lower = text.lower()
    for btype, keywords in patterns.items():
        if any(k.lower() in text_lower for k in keywords):
            return btype
    return "Mixed"
def construct_weather_prompt_static(
    user_description: str, detected_type: str, suggested_vars: list,
    gross_area: float, avg_space_sqft: float, workpoint_count: int, floor_count: int
) -> str:
    """Build the static weather-variable selection prompt."""
    scenario_note = """
Weather-Scenario (z-score offset, user will pick one):
• Normal → 0 σ offset (historical monthly mean)
• Hot → +1 σ on temp_mean & CDD_sum, −0.5 σ on humidity_mean
• ColdWet → −1 σ on temp_mean, +1 σ on HDD_sum & humidity_mean
• WindyCloudy → +1 σ on wind_speed_mean & clouds_all_mean
LLM only needs to recommend variables; offsets are applied downstream.
"""
    return f"""
You are an expert in building energy modeling and changepoint detection.
{scenario_note}
Building Description (current use only):
{user_description}
Inferred Operation Type: {detected_type}
Structural Information:
- Building Gross Area: {gross_area:,.0f} sq ft
- Average Space Size: {avg_space_sqft:,.0f} sq ft
- Total Workpoint Count: {workpoint_count}
- Floor Count: {floor_count}
Rule-based Suggested Variables: {', '.join(suggested_vars)}
Candidate Weather Variables:
temp_mean · temp_std · HDD_sum · CDD_sum · rain_sum · clouds_all_mean · humidity_mean · wind_speed_mean
Tasks:
1. Select 3–5 variables that best capture energy-use behaviour under the current configuration.
2. Briefly justify each choice.
3. Return a markdown table with columns: Selected Variable | Reason.
"""
def chat_with_ollama(messages: list, model: str = "mistral") -> str:
    """Send a chat request to a local Ollama server and return the reply text."""
    import requests

    url = "http://localhost:11434/api/chat"
    try:
        response = requests.post(
            url,
            json={"model": model, "messages": messages, "stream": False},
            timeout=30
        )
        response.raise_for_status()
        return response.json()["message"]["content"]
    except requests.exceptions.RequestException as e:
        raise Exception(f"Ollama API error: {str(e)}")
    except KeyError:
        raise Exception("Invalid response format from Ollama API")
def parse_selected_vars(md: str) -> list:
    """Parse the selected variable names from the first column of a markdown table."""
    vars_ = []
    for row in md.strip().splitlines():
        if row.startswith("|") and not row.startswith("| Selected"):
            parts = row.split("|")
            if len(parts) > 1:
                first = parts[1].strip()
                if first and first != '---' and first not in ["Selected Variable", ""]:
                    vars_.append(first)
    return vars_
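

# A self-contained sketch of parse_selected_vars on a literal markdown table, so the parsing can
# be checked without calling the LLM (the table contents are made up):
if __name__ == "__main__":
    sample_md = """
| Selected Variable | Reason |
|---|---|
| temp_mean | Drives the cooling load |
| CDD_sum | Captures cooling degree days |
"""
    print(parse_selected_vars(sample_md))  # expected: ['temp_mean', 'CDD_sum']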