Spaces:

retopara
/

ragflow

Build error

App Files Files Community

ragflow / deepdoc /parser /resume /step_one.py

zhichyu

Edit chunk shall update instead of insert it (#3709)

1b2aab6 11 months ago

raw

history blame

6.18 kB

	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	import json
	from deepdoc.parser.resume.entities import degrees, regions, industries

	# Output schema as "<column name> <type>" entries.
	# NOTE: refactor() pairs these names *positionally* with the values of its
	# alphabetically sorted working columns, so the order of this list is
	# significant: each "*_obj" entry sits where its un-suffixed source column
	# (e.g. "work" for "work_obj") lands in that sort order.
	FIELDS = [
	    "address STRING",
	    "annual_salary int",
	    "annual_salary_from int",
	    "annual_salary_to int",
	    "birth STRING",
	    "card STRING",
	    "certificate_obj string",
	    "city STRING",
	    "corporation_id int",
	    "corporation_name STRING",
	    "corporation_type STRING",
	    "degree STRING",
	    "discipline_name STRING",
	    "education_obj string",
	    "email STRING",
	    "expect_annual_salary int",
	    "expect_city_names string",
	    "expect_industry_name STRING",
	    "expect_position_name STRING",
	    "expect_salary_from int",
	    "expect_salary_to int",
	    "expect_type STRING",
	    "gender STRING",
	    "industry_name STRING",
	    "industry_names STRING",
	    "is_deleted STRING",
	    "is_fertility STRING",
	    "is_house STRING",
	    "is_management_experience STRING",
	    "is_marital STRING",
	    "is_oversea STRING",
	    "language_obj string",
	    "name STRING",
	    "nation STRING",
	    "phone STRING",
	    "political_status STRING",
	    "position_name STRING",
	    "project_obj string",
	    "responsibilities string",
	    "salary_month int",
	    "scale STRING",
	    "school_name STRING",
	    "self_remark string",
	    "skill_obj string",
	    "title_name STRING",
	    "tob_resume_id STRING",
	    "updated_at Timestamp",
	    "wechat STRING",
	    "work_obj string",
	    "work_experience int",
	    "work_start_time BIGINT",
	]

	def refactor(df):
	    """Flatten the JSON in ``df["resume_content"]`` into one flat record.

	    Each ``resume_content`` cell is parsed as JSON. Fields are pulled out of
	    its ``"contact"`` and ``"basic"`` sub-objects, the list-like sections
	    (education, work, ...) are serialized to strings, a few coded values
	    are translated, and every value is finally converted to a single-line
	    string.

	    Args:
	        df: pandas DataFrame with a ``resume_content`` column. A
	            ``tob_resume_id`` column is carried through if present. The
	            frame is modified in place (an ``obj`` column and the extracted
	            columns are added, missing values are replaced by ``""``).

	    Returns:
	        dict mapping the column names declared in ``FIELDS`` to the
	        sanitized string values of the FIRST row of ``df``. Names and
	        values are paired positionally against the alphabetically sorted
	        working columns.

	    Raises:
	        KeyError: if ``df`` has no ``resume_content`` column.
	        IndexError: if ``df`` has no rows.
	        AttributeError: if a ``phone``/``tel`` value is not a string.
	    """
	    # Helper columns that feed derived columns but are not part of the output.
	    helper_columns = {"industry_ids", "management_experience", "marital", "tel"}

	    def deal_obj(obj, k, kk):
	        """Return ``obj[k][kk]``, or ``""`` when either level is not a dict."""
	        if not isinstance(obj, dict):
	            return ""
	        section = obj.get(k, {})
	        if not isinstance(section, dict):
	            return ""
	        return section.get(kk, "")

	    def loadjson(line):
	        """Parse ``line`` as JSON; any failure yields an empty dict."""
	        try:
	            return json.loads(line)
	        except Exception:
	            # Deliberate best-effort: an unparsable resume becomes an empty
	            # record instead of aborting the whole conversion.
	            return {}

	    def dump_section(obj, key):
	        """Serialize the top-level section ``key`` of ``obj`` to a string."""
	        if isinstance(obj, dict):
	            value = obj.get(key)
	            if isinstance(value, dict) or not value:
	                # A missing key dumps as "{}"; a present-but-empty value
	                # dumps as itself (e.g. "null", "[]").
	                return json.dumps(obj.get(key, {}), ensure_ascii=False)
	        # NOTE(review): this stringifies the WHOLE parsed object rather than
	        # just obj[key]. Behavior kept as-is; confirm it is intended.
	        return str(obj).replace("None", "")

	    def arr2str(a):
	        """Join a list (or comma-separated value) into a space-separated string."""
	        if not a:
	            return ""
	        if isinstance(a, list):
	            a = " ".join([str(i) for i in a])
	        return str(a).replace(",", " ")

	    def gender_label(x):
	        if x == 'M':
	            return "男"
	        if x == 'F':
	            return "女"
	        return ""

	    def yes_no_label(x):
	        if x == 'Y':
	            return '是'
	        if x == 'N':
	            return '否'
	        return ""

	    def one_line(s):
	        """Stringify ``s`` and escape tabs/newlines so it stays on one line."""
	        return str(s).replace("\t", " ").replace("\n", "\\n").replace("\r", "\\n")

	    df["obj"] = df["resume_content"].map(loadjson)
	    df.fillna("", inplace=True)

	    # Names of every column produced so far; "updated_at" is registered again
	    # by the "basic" extraction below and de-duplicated before the reindex.
	    clms = ["tob_resume_id", "updated_at"]

	    def extract(nms, cc=None):
	        """Create one column per name in ``nms`` and register it in ``clms``.

	        With ``cc`` the value is read from the ``cc`` sub-object of the
	        parsed resume; without it the top-level section is serialized.
	        """
	        clms.extend(nms)
	        for c in nms:
	            if cc:
	                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
	            else:
	                df[c] = df["obj"].map(lambda x: dump_section(x, c))

	    extract(["education", "work", "certificate", "project", "language",
	             "skill"])
	    extract(["wechat", "phone", "is_deleted",
	             "name", "tel", "email"], "contact")
	    extract(["nation", "expect_industry_name", "salary_month",
	             "industry_ids", "is_house", "birth", "annual_salary_from",
	             "annual_salary_to", "card",
	             "expect_salary_to", "expect_salary_from",
	             "expect_position_name", "gender", "city",
	             "is_fertility", "expect_city_names",
	             "political_status", "title_name", "expect_annual_salary",
	             "industry_name", "address", "position_name", "school_name",
	             "corporation_id",
	             "is_oversea", "responsibilities",
	             "work_start_time", "degree", "management_experience",
	             "expect_type", "corporation_type", "scale", "corporation_name",
	             "self_remark", "annual_salary", "work_experience",
	             "discipline_name", "marital", "updated_at"], "basic")

	    # Translate coded values through the project lookup tables.
	    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
	    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
	    df["industry_names"] = df["industry_ids"].map(
	        lambda x: " ".join(
	            [" ".join(industries.get_names(i)) for i in str(x).split(",")]))
	    clms.append("industry_names")

	    df["expect_industry_name"] = df["expect_industry_name"].map(arr2str)
	    df["gender"] = df["gender"].map(gender_label)
	    for c in ["is_fertility", "is_oversea", "is_house",
	              "management_experience", "marital"]:
	        df[c] = df[c].map(yes_no_label)
	    df["is_management_experience"] = df["management_experience"]
	    df["is_marital"] = df["marital"]
	    clms.extend(["is_management_experience", "is_marital"])

	    df.fillna("", inplace=True)

	    # Backfill an empty phone from tel. This is done positionally so it works
	    # for any index; the previous range(len(df)) + df.loc[i, ...] lookup was
	    # label-based and broke on frames without a default 0..n-1 index.
	    df["phone"] = [
	        tel.strip() if not phone.strip() and tel.strip() else phone
	        for phone, tel in zip(df["phone"], df["tel"])
	    ]

	    # Drop the helper columns, de-duplicate, and sort so the column order
	    # lines up with FIELDS for the positional zip in the return statement.
	    columns = sorted(set(clms) - helper_columns)

	    df = df.reindex(columns, axis=1)
	    for c in columns:
	        df[c] = df[c].map(one_line)

	    return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))