How to get first string value with numbers in pyspark array

Question
I want to extract just the first numeric instance from the languages field as a date in another column of the pyspark dataframe.
Sample data
```python
data = [
    ("James", "Java_Scala_C++_20230510_2023051345"),
    ("Mindy", "Spark_Java_20211014_20211014255_C++"),
    ("Julia", "CSharp_20200115_VB")
]

from pyspark.sql.types import StringType, StructType, StructField

schema = StructType([
    StructField("name", StringType(), True),
    StructField("languages", StringType(), True)
])

df = spark.createDataFrame(data=data, schema=schema)
df.display()
```
Using split on the column, I can break the field into an array that contains what I'm looking for. I can use to_date to convert the string to a date, but I'd like help selecting the first numeric element without hardcoding an index, since the number values appear at different positions in each row. I tried regexp_extract, but it doesn't work on arrays.
```python
from pyspark.sql.functions import *

df = df.withColumn('languages_split', split(col('languages'), '_'))
df.display()
```
Desired output: two columns with the following values (string names and dates).
James: 20230510
Mindy: 20211014
Julia: 20200115
Answer 1

Score: 1
Try regexp_extract with the first capture group. The pattern _(\d{1,8})_ captures a run of up to eight digits that starts after a _ and ends before a _.

Example:
```python
data = [
    ("James", "Java_Scala_C++_20230510_2023051345"),
    ("Mindy", "Spark_Java_20211014_20211014255_C++"),
    ("Julia", "CSharp_20200115_VB")
]

from pyspark.sql.types import StringType, StructType, StructField

schema = StructType([
    StructField("name", StringType(), True),
    StructField("languages", StringType(), True)
])

df = spark.createDataFrame(data=data, schema=schema)

from pyspark.sql.functions import *

# Raw string so \d is not treated as a string escape sequence
df = df.withColumn('languages_split', regexp_extract(col("languages"), r"_(\d{1,8})_", 1))
df.show(10, False)

#+-----+-----------------------------------+---------------+
#|name |languages                          |languages_split|
#+-----+-----------------------------------+---------------+
#|James|Java_Scala_C++_20230510_2023051345 |20230510       |
#|Mindy|Spark_Java_20211014_20211014255_C++|20211014       |
#|Julia|CSharp_20200115_VB                 |20200115       |
#+-----+-----------------------------------+---------------+
```
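Since the asker ultimately wants a date rather than a string, the extracted value can be fed straight to to_date. A minimal sketch building on the same df (the 'date' column name is illustrative, not from the original answer):

```python
from pyspark.sql.functions import regexp_extract, to_date, col

# Parse the extracted yyyyMMdd string into a DateType column
df = df.withColumn(
    'date',
    to_date(regexp_extract(col('languages'), r'_(\d{1,8})_', 1), 'yyyyMMdd')
)
```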
Answer 2

Score: 1

If you need to extract and parse the date, you can do:
```python
import datetime as dt
from pyspark.sql.functions import *
from pyspark.sql.types import DateType

def parse_date(x):
    try:
        return dt.datetime.strptime(x, '%Y%m%d')
    except ValueError:
        return None

# Return the first parseable date in the array, or None if there is none
parse_date_udf = udf(
    lambda arr: next((parse_date(x) for x in arr if parse_date(x) is not None), None),
    DateType()
)

df = df.withColumn('date', parse_date_udf(split(col('languages'), '_')))
df.show(100, False)
```

```
+-----+-----------------------------------+----------+
|name |languages                          |date      |
+-----+-----------------------------------+----------+
|James|Java_Scala_C++_20230510_2023051345 |2023-05-10|
|Mindy|Spark_Java_20211014_20211014255_C++|2021-10-14|
|Julia|CSharp_20200115_VB                 |2020-01-15|
+-----+-----------------------------------+----------+
```
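As a quick sanity check outside Spark, the helper accepts only strings that are valid yyyyMMdd dates; longer digit runs leave unconverted characters, so strptime raises ValueError and the helper falls through to None:

```python
print(parse_date('20230510'))    # 2023-05-10 00:00:00
print(parse_date('2023051345'))  # None: 'unconverted data remains' -> ValueError
print(parse_date('Scala'))       # None: not a date at all
```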
Answer 3

Score: 0
"for folks not wanting a regex solution, you can split the string and use filter
higher order function.
see example
```python
from pyspark.sql import functions as func

data_sdf. \
    withColumn('lang_split_arr', func.split('lang', '_')). \
    withColumn('lang_arr_nums_only',
               func.filter('lang_split_arr', lambda x: x.cast('int').isNotNull())
               ). \
    withColumn('lang_dt_only',
               func.to_date(func.col('lang_arr_nums_only').getItem(0), 'yyyyMMdd')
               ). \
    show(truncate=False)

# +-----+-----------------------------------+-----------------------------------------+----------------------+------------+
# |name |lang                               |lang_split_arr                           |lang_arr_nums_only    |lang_dt_only|
# +-----+-----------------------------------+-----------------------------------------+----------------------+------------+
# |James|Java_Scala_C++_20230510_2023051345 |[Java, Scala, C++, 20230510, 2023051345] |[20230510, 2023051345]|2023-05-10  |
# |Mindy|Spark_Java_20211014_20211014255_C++|[Spark, Java, 20211014, 20211014255, C++]|[20211014]            |2021-10-14  |
# |Julia|CSharp_20200115_VB                 |[CSharp, 20200115, VB]                   |[20200115]            |2020-01-15  |
# +-----+-----------------------------------+-----------------------------------------+----------------------+------------+
```
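One caveat: passing a Python lambda to func.filter requires Spark 3.1 or later. On older clusters, a sketch of the same idea via the SQL higher-order filter function (available since Spark 2.4), assuming the same data_sdf/lang naming as above:

```python
from pyspark.sql import functions as func

# Same filtering logic expressed as a SQL lambda inside expr()
older = data_sdf.withColumn(
    'lang_arr_nums_only',
    func.expr("filter(split(lang, '_'), x -> cast(x as int) is not null)")
).withColumn(
    'lang_dt_only',
    func.to_date(func.col('lang_arr_nums_only')[0], 'yyyyMMdd')
)
```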