|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | import json | 
					
						
						|  | from deepdoc.parser.resume.entities import degrees, regions, industries | 
					
						
						|  |  | 
					
						
						|  | FIELDS = [ | 
					
						
						|  | "address STRING", | 
					
						
						|  | "annual_salary int", | 
					
						
						|  | "annual_salary_from int", | 
					
						
						|  | "annual_salary_to int", | 
					
						
						|  | "birth STRING", | 
					
						
						|  | "card STRING", | 
					
						
						|  | "certificate_obj string", | 
					
						
						|  | "city STRING", | 
					
						
						|  | "corporation_id int", | 
					
						
						|  | "corporation_name STRING", | 
					
						
						|  | "corporation_type STRING", | 
					
						
						|  | "degree STRING", | 
					
						
						|  | "discipline_name STRING", | 
					
						
						|  | "education_obj string", | 
					
						
						|  | "email STRING", | 
					
						
						|  | "expect_annual_salary int", | 
					
						
						|  | "expect_city_names string", | 
					
						
						|  | "expect_industry_name STRING", | 
					
						
						|  | "expect_position_name STRING", | 
					
						
						|  | "expect_salary_from int", | 
					
						
						|  | "expect_salary_to int", | 
					
						
						|  | "expect_type STRING", | 
					
						
						|  | "gender STRING", | 
					
						
						|  | "industry_name STRING", | 
					
						
						|  | "industry_names STRING", | 
					
						
						|  | "is_deleted STRING", | 
					
						
						|  | "is_fertility STRING", | 
					
						
						|  | "is_house STRING", | 
					
						
						|  | "is_management_experience STRING", | 
					
						
						|  | "is_marital STRING", | 
					
						
						|  | "is_oversea STRING", | 
					
						
						|  | "language_obj string", | 
					
						
						|  | "name STRING", | 
					
						
						|  | "nation STRING", | 
					
						
						|  | "phone STRING", | 
					
						
						|  | "political_status STRING", | 
					
						
						|  | "position_name STRING", | 
					
						
						|  | "project_obj string", | 
					
						
						|  | "responsibilities string", | 
					
						
						|  | "salary_month int", | 
					
						
						|  | "scale STRING", | 
					
						
						|  | "school_name STRING", | 
					
						
						|  | "self_remark string", | 
					
						
						|  | "skill_obj string", | 
					
						
						|  | "title_name STRING", | 
					
						
						|  | "tob_resume_id STRING", | 
					
						
						|  | "updated_at Timestamp", | 
					
						
						|  | "wechat STRING", | 
					
						
						|  | "work_obj string", | 
					
						
						|  | "work_experience int", | 
					
						
						|  | "work_start_time BIGINT" | 
					
						
						|  | ] | 
					
						
						|  |  | 
					
						
						|  | def refactor(df): | 
					
						
						|  | def deal_obj(obj, k, kk): | 
					
						
						|  | if not isinstance(obj, type({})): | 
					
						
						|  | return "" | 
					
						
						|  | obj = obj.get(k, {}) | 
					
						
						|  | if not isinstance(obj, type({})): | 
					
						
						|  | return "" | 
					
						
						|  | return obj.get(kk, "") | 
					
						
						|  |  | 
					
						
						|  | def loadjson(line): | 
					
						
						|  | try: | 
					
						
						|  | return json.loads(line) | 
					
						
						|  | except Exception: | 
					
						
						|  | pass | 
					
						
						|  | return {} | 
					
						
						|  |  | 
					
						
						|  | df["obj"] = df["resume_content"].map(lambda x: loadjson(x)) | 
					
						
						|  | df.fillna("", inplace=True) | 
					
						
						|  |  | 
					
						
						|  | clms = ["tob_resume_id", "updated_at"] | 
					
						
						|  |  | 
					
						
						|  | def extract(nms, cc=None): | 
					
						
						|  | nonlocal clms | 
					
						
						|  | clms.extend(nms) | 
					
						
						|  | for c in nms: | 
					
						
						|  | if cc: | 
					
						
						|  | df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c)) | 
					
						
						|  | else: | 
					
						
						|  | df[c] = df["obj"].map( | 
					
						
						|  | lambda x: json.dumps( | 
					
						
						|  | x.get( | 
					
						
						|  | c, | 
					
						
						|  | {}), | 
					
						
						|  | ensure_ascii=False) if isinstance( | 
					
						
						|  | x, | 
					
						
						|  | type( | 
					
						
						|  | {})) and ( | 
					
						
						|  | isinstance( | 
					
						
						|  | x.get(c), | 
					
						
						|  | type( | 
					
						
						|  | {})) or not x.get(c)) else str(x).replace( | 
					
						
						|  | "None", | 
					
						
						|  | "")) | 
					
						
						|  |  | 
					
						
						|  | extract(["education", "work", "certificate", "project", "language", | 
					
						
						|  | "skill"]) | 
					
						
						|  | extract(["wechat", "phone", "is_deleted", | 
					
						
						|  | "name", "tel", "email"], "contact") | 
					
						
						|  | extract(["nation", "expect_industry_name", "salary_month", | 
					
						
						|  | "industry_ids", "is_house", "birth", "annual_salary_from", | 
					
						
						|  | "annual_salary_to", "card", | 
					
						
						|  | "expect_salary_to", "expect_salary_from", | 
					
						
						|  | "expect_position_name", "gender", "city", | 
					
						
						|  | "is_fertility", "expect_city_names", | 
					
						
						|  | "political_status", "title_name", "expect_annual_salary", | 
					
						
						|  | "industry_name", "address", "position_name", "school_name", | 
					
						
						|  | "corporation_id", | 
					
						
						|  | "is_oversea", "responsibilities", | 
					
						
						|  | "work_start_time", "degree", "management_experience", | 
					
						
						|  | "expect_type", "corporation_type", "scale", "corporation_name", | 
					
						
						|  | "self_remark", "annual_salary", "work_experience", | 
					
						
						|  | "discipline_name", "marital", "updated_at"], "basic") | 
					
						
						|  |  | 
					
						
						|  | df["degree"] = df["degree"].map(lambda x: degrees.get_name(x)) | 
					
						
						|  | df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x))) | 
					
						
						|  | df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in | 
					
						
						|  | str(x).split(",")])) | 
					
						
						|  | clms.append("industry_names") | 
					
						
						|  |  | 
					
						
						|  | def arr2str(a): | 
					
						
						|  | if not a: | 
					
						
						|  | return "" | 
					
						
						|  | if isinstance(a, list): | 
					
						
						|  | a = " ".join([str(i) for i in a]) | 
					
						
						|  | return str(a).replace(",", " ") | 
					
						
						|  |  | 
					
						
						|  | df["expect_industry_name"] = df["expect_industry_name"].map( | 
					
						
						|  | lambda x: arr2str(x)) | 
					
						
						|  | df["gender"] = df["gender"].map( | 
					
						
						|  | lambda x: "男" if x == 'M' else ( | 
					
						
						|  | "女" if x == 'F' else "")) | 
					
						
						|  | for c in ["is_fertility", "is_oversea", "is_house", | 
					
						
						|  | "management_experience", "marital"]: | 
					
						
						|  | df[c] = df[c].map( | 
					
						
						|  | lambda x: '是' if x == 'Y' else ( | 
					
						
						|  | '否' if x == 'N' else "")) | 
					
						
						|  | df["is_management_experience"] = df["management_experience"] | 
					
						
						|  | df["is_marital"] = df["marital"] | 
					
						
						|  | clms.extend(["is_management_experience", "is_marital"]) | 
					
						
						|  |  | 
					
						
						|  | df.fillna("", inplace=True) | 
					
						
						|  | for i in range(len(df)): | 
					
						
						|  | if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip(): | 
					
						
						|  | df.loc[i, "phone"] = df.loc[i, "tel"].strip() | 
					
						
						|  |  | 
					
						
						|  | for n in ["industry_ids", "management_experience", "marital", "tel"]: | 
					
						
						|  | for i in range(len(clms)): | 
					
						
						|  | if clms[i] == n: | 
					
						
						|  | del clms[i] | 
					
						
						|  | break | 
					
						
						|  |  | 
					
						
						|  | clms = list(set(clms)) | 
					
						
						|  |  | 
					
						
						|  | df = df.reindex(sorted(clms), axis=1) | 
					
						
						|  |  | 
					
						
						|  | for c in clms: | 
					
						
						|  | df[c] = df[c].map( | 
					
						
						|  | lambda s: str(s).replace( | 
					
						
						|  | "\t", | 
					
						
						|  | " ").replace( | 
					
						
						|  | "\n", | 
					
						
						|  | "\\n").replace( | 
					
						
						|  | "\r", | 
					
						
						|  | "\\n")) | 
					
						
						|  |  | 
					
						
						|  | return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0])) | 
					
						
						|  |  |