英文:
Replace column value substring with hash of substring in PySpark
问题
I have a dataframe with a column containing a description including customer ids which I need to replace with their sha2
hashed version.
Example: the column value "X customer 0013120109 in country AU." should be turned into "X customer d8e824e6a2d5b32830c93ee0ca690ac6cb976cc51706b1a856cd1a95826bebd in country AU."
MRE:
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col, sha2, regexp_replace, lit, concat
from pyspark.sql.types import LongType, StringType, StructField, StructType
data = [
[1, "Sold device 11312."],
[2, "X customer 0013120109 in country AU."],
[3, "Y customer 0013140033 in country BR."],
]
schema = StructType(
[
StructField(name="Id", dataType=LongType()),
StructField(name="Description", dataType=StringType())
]
)
df = spark.createDataFrame(data=data, schema=schema)
My attempted solution was to use regexp_replace
in combination with regexp_extract
, but it expects a concrete string as "replacement" value - while my replacement value would be dynamic.
df = (
df
.withColumn("Description", regexp_replace("Description",
r"customer \d+",
concat(lit("customer "),
sha2(regexp_extract(
"Description",
r".* customer (\d+) .*",
1),
256
)
)
)
)
)
PS: I really want to avoid UDFs since the transformation from JVM to Python and back is a huge performance degradation.
英文:
I have a dataframe with a column containing a description including customer ids which I need to replace with their sha2
hashed version.
Example: the column value "X customer 0013120109 in country AU.
should be turned into "X customer d8e824e6a2d5b32830c93ee0ca690ac6cb976cc51706b1a856cd1a95826bebd in country AU.
MRE:
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col, sha2, regexp_replace, lit, concat
from pyspark.sql.types import LongType, StringType, StructField, StructType
data = [
[1, "Sold device 11312."],
[2, "X customer 0013120109 in country AU."],
[3, "Y customer 0013140033 in country BR."],
]
schema = StructType(
[
StructField(name="Id", dataType=LongType()),
StructField(name="Description", dataType=StringType())
]
)
df = spark.createDataFrame(data=data, schema=schema)
My attempted solution was to use regexp_replace
in combination with regexp_extract
, but it expects a concrete string as "replacement" value - while my replacement value would be dynamic.
df = (
df
.withColumn("Description", regexp_replace("Description",
r"customer \d+",
concat(lit("customer "),
sha2(regexp_extract(
"Description",
r".* customer (\d+) .*",
1),
256
)
)
)
)
)
PS: I really want to avoid UDFs since the transformation from JVM to Python and back is a huge performance degradation...
答案1
得分: 0
成功找到一个解决方案,使用concat
、substr
和expr
的组合:
df = (
df
.withColumn("regexp_extract",
regexp_extract('Description', '.* customer (\d+) .*', 1)
)
.withColumn("NewColumn",
when(expr("length(regexp_extract)") > lit(1),
# Concatenate the substrings: [before_id, hashed_id, after_id]
concat(
# From the start of the column until the index of the extract.
col("Description").substr(lit(0), expr("instr(Description, regexp_extract)") - lit(1)),
# Hashed extract
sha2(regexp_extract('Description', '.* customer (\d+) .*', 1), 256),
# Subtr from (Index extract + length of the extract) until the end of the Column.
col("Description").substr(expr("instr(Description, regexp_extract)") +
expr("length(regexp_extract)"), expr("length(Description)"))
)
).otherwise(col("Description"))
)
.drop("regexp_extract")
)
英文:
Managed to find a solution using a combination of concat
, substr
and expr
:
df = (
df
.withColumn("regexp_extract",
regexp_extract('Description', '.* customer (\\d+) .*', 1)
)
.withColumn("NewColumn",
when(expr("length(regexp_extract)") > lit(1),
# Concatenate the substrings: [before_id, hashed_id, after_id]
concat(
# From the start of the column until the index of the extract.
col("Description").substr(lit(0), expr("instr(Description, regexp_extract)") - lit(1)),
# Hashed extract
sha2(regexp_extract('Description', '.* customer (\\d+) .*', 1), 256),
# Subtr from (Index extract + length of the extract) until the end of the Column.
col("Description").substr(expr("instr(Description, regexp_extract)") +
expr("length(regexp_extract)"), expr("length(Description)"))
)
).otherwise(col("Description"))
)
.drop("regexp_extract")
)
答案2
得分: 0
这是我的两分建议:
-
方法非常简单,将字符串分为三个部分:
- 在客户 ID 之前的部分
- 客户 ID
- 客户 ID 之后的部分
-
然后对客户 ID 进行遮蔽,并连接这三个部分。
代码:
df_final = df.withColumn("regexp_extract1",
when(regexp_extract('Description', '^(.* customer) (\d+)(.*)$', 1) != '',
regexp_extract('Description', '^(.* customer) (\d+)(.*)$', 1))
.otherwise(col('Description')))
.withColumn("regexp_extract2", regexp_extract('Description', '^(.* customer) (\d+)(.*)$', 2))
.withColumn("regexp_extract3", regexp_extract('Description', '^(.* customer) (\d+)(.*)$', 3))
.withColumn("extract2_sha2", when(col('regexp_extract2') != '', sha2("regexp_extract2", 256)).otherwise(''))
.withColumn('Masked Description', F.concat(col('regexp_extract1'), lit(' '), col('extract2_sha2'), col('regexp_extract3')))
.drop(*['regexp_extract1', 'regexp_extract2', 'regexp_extract3', 'extract2_sha2'])
df_final.show(truncate=False)
输出:
+---+------------------------------------+------------------------------------------------------------------------------------------+
|Id |Description |Masked Description |
+---+------------------------------------+------------------------------------------------------------------------------------------+
|1 |Sold device 11312. |Sold device 11312. |
|2 |X customer 0013120109 in country AU.|X customer d8e824e6a2d5b32830c93ee0ca690ac6cb976cc51706b1a856cd1a95826bebdb in country AU.|
|3 |Y customer 0013140033 in country BR.|Y customer 2f4ab0aeb1f3332b8b9ccdd3a9fca759f267074e6621e5362acdd6f22211f167 in country BR.|
+---+------------------------------------+------------------------------------------------------------------------------------------+
英文:
Here are my 2 cents:
- Approach is quite simple, split the string into 3 parts:
- One with anything before the customer id
- customer id
- Anything after customer id.
- Then mask the customer id, and concat all 3 of them.
Code:
df_final = df .withColumn("regexp_extract1",
when(regexp_extract('Description', '^(.* customer) (\\d+)(.*)$', 1) != '',
regexp_extract('Description', '^(.* customer) (\\d+)(.*)$', 1))\
.otherwise(col('Description')))\
.withColumn("regexp_extract2", regexp_extract('Description', '^(.* customer) (\\d+)(.*)$', 2))\
.withColumn("regexp_extract3", regexp_extract('Description', '^(.* customer) (\\d+)(.*)$', 3))\
.withColumn("extract2_sha2",when(col('regexp_extract2')!='',sha2("regexp_extract2",256)).otherwise(''))\
.withColumn('Masked Description',F.concat(col('regexp_extract1'),lit(' '),col('extract2_sha2'),col('regexp_extract3')))\
.drop(*['regexp_extract1','regexp_extract2','regexp_extract3','extract2_sha2'])
df_final.show(truncate=False)
Output:
+---+------------------------------------+------------------------------------------------------------------------------------------+
|Id |Description |Masked Description |
+---+------------------------------------+------------------------------------------------------------------------------------------+
|1 |Sold device 11312. |Sold device 11312. |
|2 |X customer 0013120109 in country AU.|X customer d8e824e6a2d5b32830c93ee0ca690ac6cb976cc51706b1a856cd1a95826bebdb in country AU.|
|3 |Y customer 0013140033 in country BR.|Y customer 2f4ab0aeb1f3332b8b9ccdd3a9fca759f267074e6621e5362acdd6f22211f167 in country BR.|
+---+------------------------------------+------------------------------------------------------------------------------------------+
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论