Transform and filter array of structs with parent struct field name.

huangapple go评论103阅读模式
英文:

Transform and filter array of structs with parent struct field name

问题

我想在下面这个 StackOverflow 帖子的基础上更进一步:提取结构体字段名,按 role 的取值过滤每个结构体数组,并把每个结构体元素转换成带有所提取字段名的新结构体。原文如下:

I am trying to go one step further than this StackOverflow post (https://stackoverflow.com/questions/74299990/convert-struct-of-structs-to-array-of-structs-pulling-struct-field-name-inside): I need to pull the struct field name, filter each struct array based on a condition on the role values, and transform each struct element into a new struct with the extracted struct field name.

Input:

  1. |-- a: array (nullable = true)
  2. | |-- element: struct (containsNull = true)
  3. | | |-- struct_key: string (nullable = true)
  4. | | |-- two: array (nullable = true)
  5. | | | |-- element: struct (containsNull = true)
  6. | | | | |-- name: string (nullable = true)
  7. | | | | |-- role: string (nullable = true)
  8. | |-- element: struct (containsNull = true)
  9. | | |-- struct_key: string (nullable = true)
  10. | | |-- two: array (nullable = true)
  11. | | | |-- element: struct (containsNull = true)
  12. | | | | |-- name: string (nullable = true)
  13. | | | | |-- role: string (nullable = true)
  1. {
  2. "a": [{
  3. "two": [{
  4. "name": "person1",
  5. "role": "role1"
  6. },
  7. {
  8. "name": "person2",
  9. "role": "role1"
  10. },
  11. {
  12. "name": "person3",
  13. "role": "role2"
  14. }],
  15. "struct_key": "test1"
  16. },
  17. {
  18. "two": [{
  19. "name": "person4",
  20. "role": "role1"
  21. },
  22. {
  23. "name": "person5",
  24. "role": "role1"
  25. },
  26. {
  27. "name": "person6",
  28. "role": "role2"
  29. }],
  30. "struct_key": "test2"
  31. }
  32. ]
  33. }
  1. input ={
  2. "a": [{
  3. "two": [{
  4. "name": "person1",
  5. "role": "role1"
  6. },
  7. {
  8. "name": "person2",
  9. "role": "role1"
  10. },
  11. {
  12. "name": "person3",
  13. "role": "role2"
  14. }
  15. ],
  16. "struct_key": "test1"
  17. },
  18. {
  19. "two": [{
  20. "name": "person4",
  21. "role": "role1"
  22. },
  23. {
  24. "name": "person5",
  25. "role": "role1"
  26. },
  27. {
  28. "name": "person6",
  29. "role": "role2"
  30. }
  31. ],
  32. "struct_key": "test2"
  33. }
  34. ]
  35. }
  36. df = spark.read.json(sc.parallelize([input]))
  37. print(df.selectExpr('inline(a)').schema)

Expected output after filtering (for roles) and new struct transformation:

  1. |-- role_output: array (nullable = true)
  2. | |-- element: struct (containsNull = true)
  3. | | |-- struct_key: string (nullable = true)
  4. | | |-- name: string (nullable = true)
  1. {
  2. "role1_output": [
  3. {
  4. "struct_key": "test1",
  5. "name": "person1"
  6. },
  7. {
  8. "struct_key": "test1",
  9. "name": "person2"
  10. },
  11. {
  12. "struct_key": "test2",
  13. "name": "person4"
  14. },
  15. {
  16. "struct_key": "test2",
  17. "name": "person5"
  18. }
  19. ]
  20. }
  21. {
  22. "role2_output": [
  23. {
  24. "struct_key": "test1",
  25. "name": "person3"
  26. },
  27. {
  28. "struct_key": "test2",
  29. "name": "person6"
  30. }
  31. ]
  32. }

我尝试过那个 StackOverflow 回答中把 struct 转换为 map 类型的做法,但不知道如何把提取出的 struct_key 与另一个结构体数组中的字段值组合起来创建新的结构体:一旦开始转换数组元素,struct_key 字段的值就丢失了。有什么建议吗?

英文:

I am trying to go one step further than this StackOverflow post (https://stackoverflow.com/questions/74299990/convert-struct-of-structs-to-array-of-structs-pulling-struct-field-name-inside): I need to pull the struct field name, filter each struct array based on a condition on the role values, and transform each struct element into a new struct with the extracted struct field name.

Input:

  1. |-- a: array (nullable = true)
  2. | |-- element: struct (containsNull = true)
  3. | | |-- struct_key: string (nullable = true)
  4. | | |-- two: array (nullable = true)
  5. | | | |-- element: struct (containsNull = true)
  6. | | | | |-- name: string (nullable = true)
  7. | | | | |-- role: string (nullable = true)
  8. | |-- element: struct (containsNull = true)
  9. | | |-- struct_key: string (nullable = true)
  10. | | |-- two: array (nullable = true)
  11. | | | |-- element: struct (containsNull = true)
  12. | | | | |-- name: string (nullable = true)
  13. | | | | |-- role: string (nullable = true)
  1. {
  2. "a": [{
  3. "two": [{
  4. "name": "person1",
  5. "role": "role1"
  6. },
  7. {
  8. "name": "person2",
  9. "role": "role1"
  10. },
  11. {
  12. "name": "person3",
  13. "role": "role2"
  14. }],
  15. "struct_key": "test1"
  16. },
  17. {
  18. "two": [{
  19. "name": "person4",
  20. "role": "role1"
  21. },
  22. {
  23. "name": "person5",
  24. "role": "role1"
  25. },
  26. {
  27. "name": "person6",
  28. "role": "role2"
  29. }],
  30. "struct_key": "test2"
  31. }
  32. ]
  33. }
  1. input ={
  2. "a": [{
  3. "two": [{
  4. "name": "person1",
  5. "role": "role1"
  6. },
  7. {
  8. "name": "person2",
  9. "role": "role1"
  10. },
  11. {
  12. "name": "person3",
  13. "role": "role2"
  14. }
  15. ],
  16. "struct_key": "test1"
  17. },
  18. {
  19. "two": [{
  20. "name": "person4",
  21. "role": "role1"
  22. },
  23. {
  24. "name": "person5",
  25. "role": "role1"
  26. },
  27. {
  28. "name": "person6",
  29. "role": "role2"
  30. }
  31. ],
  32. "struct_key": "test2"
  33. }
  34. ]
  35. }
  36. df = spark.read.json(sc.parallelize([input]))
  37. print(df.selectExpr('inline(a)').schema)

Expected output after filtering (for roles) and new struct transformation:

  1. |-- role_output: array (nullable = true)
  2. | |-- element: struct (containsNull = true)
  3. | | |-- struct_key: string (nullable = true)
  4. | | |-- name: string (nullable = true)
  1. {
  2. "role1_output": [
  3. {
  4. "struct_key": "test1",
  5. "name": "person1"
  6. },
  7. {
  8. "struct_key": "test1",
  9. "name": "person2"
  10. },
  11. {
  12. "struct_key": "test2",
  13. "name": "person4"
  14. },
  15. {
  16. "struct_key": "test2",
  17. "name": "person5"
  18. }
  19. ]
  20. }
  21. {
  22. "role2_output": [
  23. {
  24. "struct_key": "test1",
  25. "name": "person3"
  26. },
  27. {
  28. "struct_key": "test2",
  29. "name": "person6"
  30. }
  31. ]
  32. }

I have tried the struct-to-map type conversion from that StackOverflow post's answer, but I cannot figure out how to combine the extracted struct_key with the field values from the other array of structs to create a new struct: as soon as I start transforming the array elements, I lose the struct_key field value. Any advice?

答案1

得分: 0

见下文:

  1. from pyspark.sql.functions import explode
  2. from pyspark.sql.functions import array
  3. from pyspark.sql.functions import struct
  4. from pyspark.sql.functions import collect_list
  5. input ={
  6. "a": [{
  7. "two": [{
  8. "name": "person1",
  9. "role": "role1"
  10. },
  11. {
  12. "name": "person2",
  13. "role": "role1"
  14. },
  15. {
  16. "name": "person3",
  17. "role": "role2"
  18. }
  19. ],
  20. "struct_key": "test1"
  21. },
  22. {
  23. "two": [{
  24. "name": "person4",
  25. "role": "role1"
  26. },
  27. {
  28. "name": "person5",
  29. "role": "role1"
  30. },
  31. {
  32. "name": "person6",
  33. "role": "role2"
  34. }
  35. ],
  36. "struct_key": "test2"
  37. }
  38. ]
  39. }
  40. df = spark.read.json(sc.parallelize([input])).selectExpr('inline(a)').select('struct_key', explode('two')).groupBy('col.role').agg(collect_list(struct('col.name','struct_key')))
  41. df.show(truncate=False)
  42. df.printSchema()
英文:

See below:

  1. from pyspark.sql.functions import explode
  2. from pyspark.sql.functions import array
  3. from pyspark.sql.functions import struct
  4. from pyspark.sql.functions import collect_list
  5. input ={
  6. "a": [{
  7. "two": [{
  8. "name": "person1",
  9. "role": "role1"
  10. },
  11. {
  12. "name": "person2",
  13. "role": "role1"
  14. },
  15. {
  16. "name": "person3",
  17. "role": "role2"
  18. }
  19. ],
  20. "struct_key": "test1"
  21. },
  22. {
  23. "two": [{
  24. "name": "person4",
  25. "role": "role1"
  26. },
  27. {
  28. "name": "person5",
  29. "role": "role1"
  30. },
  31. {
  32. "name": "person6",
  33. "role": "role2"
  34. }
  35. ],
  36. "struct_key": "test2"
  37. }
  38. ]
  39. }
  40. df = spark.read.json(sc.parallelize([input])).selectExpr('inline(a)').select('struct_key', explode('two')).groupBy('col.role').agg(collect_list(struct('col.name','struct_key')))
  41. df.show(truncate=False)
  42. df.printSchema()

Gives you:

  1. >>> df.show(truncate=False)
  2. +-----+------------------------------------------------------------------------+
  3. |role |collect_list(struct(col.name, struct_key)) |
  4. +-----+------------------------------------------------------------------------+
  5. |role2|[{person3, test1}, {person6, test2}] |
  6. |role1|[{person1, test1}, {person2, test1}, {person4, test2}, {person5, test2}]|
  7. +-----+------------------------------------------------------------------------+
  8. >>> df.printSchema()
  9. root
  10. |-- role: string (nullable = true)
  11. |-- collect_list(struct(col.name, struct_key)): array (nullable = false)
  12. | |-- element: struct (containsNull = false)
  13. | | |-- name: string (nullable = true)
  14. | | |-- struct_key: string (nullable = true)

huangapple
  • 本文由 发表于 2023年5月13日 11:49:33
  • 转载请务必保留本文链接:https://go.coder-hub.com/76240980.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定