英文:
Parse JSON string within dataframe and insert extracted information into another column
问题
你可以使用以下代码将table
, questions
, answers
, answer_type
等输出作为新列添加到原始的 df_sample
数据框中:
import json
import pandas as pd
# 假设你已经有了原始的 df_sample 数据框
# 创建空的列来存储新的数据
df_sample['table_output'] = ""
df_sample['questions_output'] = ""
df_sample['answers_output'] = ""
df_sample['answer_type_output'] = ""
for index, row in df_sample.iterrows():
table_json = row['table']
paragraphs_json = row['paragraphs']
questions_json = row['questions']
table = json.loads(json.dumps(table_json)).get("table")
paragraphs = [json.loads(json.dumps(x)).get("text") for x in paragraphs_json]
questions = [json.loads(json.dumps(x)).get("question") for x in questions_json]
answer = [json.loads(json.dumps(x)).get("answer") for x in questions_json]
answer_type = [json.loads(json.dumps(x)).get("answer_type") for x in questions_json]
# 将提取的数据添加到对应的列
df_sample.at[index, 'table_output'] = table
df_sample.at[index, 'questions_output'] = questions
df_sample.at[index, 'answers_output'] = answer
df_sample.at[index, 'answer_type_output'] = answer_type
# 打印结果
print(df_sample[['table_output', 'questions_output', 'answers_output', 'answer_type_output']])
这将在 df_sample
数据框中添加新的列,分别存储提取的 table
, questions
, answers
, answer_type
数据。
英文:
I am trying to extract information from each cell in a row from a data frame and add them as another column.
import json
import pandas as pd
df_nested = pd.read_json('train.json')
df_sample = df_nested.sample(n=50, random_state=0)
display(df_sample)
for index, row in df_sample.iterrows():
table_json = row['table']
paragraphs_json = row['paragraphs']
questions_json = row['questions']
table = json.loads(json.dumps(table_json)).get("table")
#print(table)
paragraphs = [json.loads(json.dumps(x)).get("text") for x in paragraphs_json]
#print(paragraphs)
questions = [json.loads(json.dumps(x)).get("question") for x in questions_json]
answer = [json.loads(json.dumps(x)).get("answer") for x in questions_json]
answer_type = [json.loads(json.dumps(x)).get("answer_type") for x in questions_json]
program = [json.loads(json.dumps(x)).get("derivation") for x in questions_json]
print(program)
The dataframe is as
table | paragraphs | questions |
---|---|---|
{"uid": "bf2c6a2f-0b76-4bba-8d3c-2ee02d1b7d73", "table": "[[, , December 31,,], [, Useful Life, 2019, 2018], [Computer equipment and software, 3 – 5 years, $57,474, $52,055], [Furniture and fixtures, 7 years, 6,096, 4,367], [Leasehold improvements, 2 – 6 years, 22,800, 9,987], [Renovation in progress, n/a, 8, 1,984], [Build-to-suit property, 25 years, —, 51,058], [Total property and equipment, gross, , 86,378, 119,451], [Less: accumulated depreciation and amortization, , (49,852), (42,197)], [Total property and equipment, net, , $36,526, $77,254]]"} | [{"uid": "07e28145-95d5-4f9f-b313-ac8c3b4a869f", "text": "Accounts Receivable", "order": "1"}, {"uid": "b41652f7-0e68-4cf6-9723-fec443b1e604", "text": "The following is a summary of Accounts receivable (in thousands):", "order": "2"}] | [{"rel_paragraphs": "2", "answer_from": "table-text", "question": "Which years does the table provide information for the company's Accounts receivable?", "scale": "", "answer_type": "multi-span", "req_comparison": "false", "order": "1", "uid": "53041a93-1d06-48fd-a478-6f690b8da302", "answer": "[2019, 2018]", "derivation": ""}, {"rel_paragraphs": "2", "answer_from": "table-text", "question": "What was the amount of accounts receivable in 2018?", "scale": "thousand", "answer_type": "span", "req_comparison": "false", "order": "2", "uid": "a196a61c-43b0-43f5-bb4b-b059a1103c54", "answer": "[225,167]", "derivation": ""}, {"rel_paragraphs": "2", "answer_from": "table-text", "question": "What was the allowance for product returns in 2019?", "scale": "thousand", "answer_type": "span", "req_comparison": "false", "order": "3", "uid": "c8656e5e-2bb7-4f03-ae73-0d04492155c0", "answer": "[(25,897)]", "derivation": ""}, {"rel_paragraphs": "2", "answer_from": "table-text", "question": "How many years did the net accounts receivable exceed $200,000 thousand?", "scale": "", "answer_type": "count", "req_comparison": "false", "order": "4", "uid": "fdf08d3d-d570-4c21-9b3e-a3c86e164665", "answer": "1", "derivation": "2018"}, {"rel_paragraphs": "2", "answer_from": "table-text", "question": "What was the change in the Allowance for doubtful accounts between 2018 and 2019?", "scale": "thousand", "answer_type": "arithmetic", "req_comparison": "false", "order": "5", "uid": "6ecb2062-daca-4e1e-900e-2b99b2fce929", "answer": "424", "derivation": "-1,054-(-1,478)"}, {"rel_paragraphs": "[]", "answer_from": "table", "question": "What was the percentage change in the Allowance for product returns between 2018 and 2019?", "scale": "percent", "answer_type": "arithmetic", "req_comparison": "false", "order": "6", "uid": "f2c1edad-622d-4959-8cd5-a7f2bd2d7bb1", "answer": "129.87", "derivation": "(-25,897+11,266)/-11,266"}] |
The above code is not an efficient one. But, how do I add the outputs from the df_sample.iterrows()
i.e. table, questions, answers, answer_type
etc.. as another column in my original df_sample
dataframe
答案1
得分: 1
以下是您提供的数据框:
import pandas as pd
df = pd.DataFrame(
{
"table": [
{
"uid": "bf2c6a2f-0b76-4bba-8d3c-2ee02d1b7d73",
"table": "[[, , December 31,,], [, Useful Life, 2019, 2018], [Computer equipment and software, 3 â\x80\x93 5 years, $57,474, $52,055], [Furniture and fixtures, 7 years, 6,096, 4,367], [Leasehold improvements, 2 â\x80\x93 6 years, 22,800, 9,987], [Renovation in progress, n/a, 8, 1,984], [Build-to-suit property, 25 years, â\x80\x94, 51,058], [Total property and equipment, gross, , 86,378, 119,451], [Less: accumulated depreciation and amortization, , (49,852), (42,197)], [Total property and equipment, net, , $36,526, $77,254]]",
}
],
"paragraphs": [
[
{
"uid": "07e28145-95d5-4f9f-b313-ac8c3b4a869f",
"text": "Accounts Receivable",
"order": "1",
},
{
"uid": "b41652f7-0e68-4cf6-9723-fec443b1e604",
"text": "The following is a summary of Accounts receivable (in thousands):",
"order": "2",
},
]
],
"questions": [
[
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "Which years does the table provide information for the company's Accounts receivable?",
"scale": "",
"answer_type": "multi-span",
"req_comparison": "false",
"order": "1",
"uid": "53041a93-1d06-48fd-a478-6f690b8da302",
"answer": "[2019, 2018]",
"derivation": "",
},
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "What was the amount of accounts receivable in 2018?",
"scale": "thousand",
"answer_type": "span",
"req_comparison": "false",
"order": "2",
"uid": "a196a61c-43b0-43f5-bb4b-b059a1103c54",
"answer": "[225,167]",
"derivation": "",
},
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "What was the allowance for product returns in 2019?",
"scale": "thousand",
"answer_type": "span",
"req_comparison": "false",
"order": "3",
"uid": "c8656e5e-2bb7-4f03-ae73-0d04492155c0",
"answer": "[(25,897)]",
"derivation": "",
},
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "How many years did the net accounts receivable exceed $200,000 thousand?",
"scale": "",
"answer_type": "count",
"req_comparison": "false",
"order": "4",
"uid": "fdf08d3d-d570-4c21-9b3e-a3c86e164665",
"answer": "1",
"derivation": "2018",
},
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "What was the change in the Allowance for doubtful accounts between 2018 and 2019?",
"scale": "thousand",
"answer_type": "arithmetic",
"req_comparison": "false",
"order": "5",
"uid": "6ecb2062-daca-4e1e-900e-2b99b2fce929",
"answer": "424",
"derivation": "-1,054-(-1,478)",
},
{
"rel_paragraphs": "[]",
"answer_from": "table",
"question": "What was the percentage change in the Allowance for product returns between 2018 and 2019?",
"scale": "percent",
"answer_type": "arithmetic",
"req_comparison": "false",
"order": "6",
"uid": "f2c1edad-622d-4959-8cd5-a7f2bd2d7bb1",
"answer": "129.87",
"derivation": "(-25,897+11,266)/-11,266",
},
]
],
}
)
这是一种使用Python内置函数isinstance
、"walrus"以及Pandas的explode
、json_normalize
和concat
的方法:
for col in df.columns:
# 处理包含JSON列表的列
if df[col].apply(lambda x: isinstance(x, list)).all():
df = df.explode(col, ignore_index=True)
# 处理JSON
if not (new_cols := pd.json_normalize(df[col])).empty:
df = pd.concat([df.drop(columns=col), new_cols], axis=1).drop(columns="uid")
然后:
print(df)
# 输出
请注意,上述代码和数据框是以英文编写的,您可以根据需要进行翻译。如果您需要进一步的帮助,请告诉我。
英文:
With the dataframe you provided:
import pandas as pd
df = pd.DataFrame(
{
"table": [
{
"uid": "bf2c6a2f-0b76-4bba-8d3c-2ee02d1b7d73",
"table": "[[, , December 31,,], [, Useful Life, 2019, 2018], [Computer equipment and software, 3 â\x80\x93 5 years, $57,474, $52,055], [Furniture and fixtures, 7 years, 6,096, 4,367], [Leasehold improvements, 2 â\x80\x93 6 years, 22,800, 9,987], [Renovation in progress, n/a, 8, 1,984], [Build-to-suit property, 25 years, â\x80\x94, 51,058], [Total property and equipment, gross, , 86,378, 119,451], [Less: accumulated depreciation and amortization, , (49,852), (42,197)], [Total property and equipment, net, , $36,526, $77,254]]",
}
],
"paragraphs": [
[
{
"uid": "07e28145-95d5-4f9f-b313-ac8c3b4a869f",
"text": "Accounts Receivable",
"order": "1",
},
{
"uid": "b41652f7-0e68-4cf6-9723-fec443b1e604",
"text": "The following is a summary of Accounts receivable (in thousands):",
"order": "2",
},
]
],
"questions": [
[
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "Which years does the table provide information for the company's Accounts receivable?",
"scale": "",
"answer_type": "multi-span",
"req_comparison": "false",
"order": "1",
"uid": "53041a93-1d06-48fd-a478-6f690b8da302",
"answer": "[2019, 2018]",
"derivation": "",
},
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "What was the amount of accounts receivable in 2018?",
"scale": "thousand",
"answer_type": "span",
"req_comparison": "false",
"order": "2",
"uid": "a196a61c-43b0-43f5-bb4b-b059a1103c54",
"answer": "[225,167]",
"derivation": "",
},
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "What was the allowance for product returns in 2019?",
"scale": "thousand",
"answer_type": "span",
"req_comparison": "false",
"order": "3",
"uid": "c8656e5e-2bb7-4f03-ae73-0d04492155c0",
"answer": "[(25,897)]",
"derivation": "",
},
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "How many years did the net accounts receivable exceed $200,000 thousand?",
"scale": "",
"answer_type": "count",
"req_comparison": "false",
"order": "4",
"uid": "fdf08d3d-d570-4c21-9b3e-a3c86e164665",
"answer": "1",
"derivation": "2018",
},
{
"rel_paragraphs": "[2]",
"answer_from": "table-text",
"question": "What was the change in the Allowance for doubtful accounts between 2018 and 2019?",
"scale": "thousand",
"answer_type": "arithmetic",
"req_comparison": "false",
"order": "5",
"uid": "6ecb2062-daca-4e1e-900e-2b99b2fce929",
"answer": "424",
"derivation": "-1,054-(-1,478)",
},
{
"rel_paragraphs": "[]",
"answer_from": "table",
"question": "What was the percentage change in the Allowance for product returns between 2018 and 2019?",
"scale": "percent",
"answer_type": "arithmetic",
"req_comparison": "false",
"order": "6",
"uid": "f2c1edad-622d-4959-8cd5-a7f2bd2d7bb1",
"answer": "129.87",
"derivation": "(-25,897+11,266)/-11,266",
},
]
],
}
)
Here is one way to do it with Python built-in function isinstance and "walrus" as well as Pandas explode, json_normalize and concat:
for col in df.columns:
# Deal with columns containing lists of json
if df[col].apply(lambda x: isinstance(x, list)).all():
df = df.explode(col, ignore_index=True)
# Deal with json
if not (new_cols := pd.json_normalize(df[col])).empty:
df = pd.concat([df.drop(columns=col), new_cols], axis=1).drop(columns="uid")
Then:
print(df)
# Output
table text order rel_paragraphs
0 [[, , December 31,,],... Accounts Receivable 1 [2] \
1 [[, , December 31,,],... Accounts Receivable 1 [2]
2 [[, , December 31,,],... Accounts Receivable 1 [2]
3 [[, , December 31,,],... Accounts Receivable 1 [2]
4 [[, , December 31,,],... Accounts Receivable 1 [2]
5 [[, , December 31,,],... Accounts Receivable 1 []
6 [[, , December 31,,],... The following is a su... 2 [2]
7 [[, , December 31,,],... The following is a su... 2 [2]
8 [[, , December 31,,],... The following is a su... 2 [2]
9 [[, , December 31,,],... The following is a su... 2 [2]
10 [[, , December 31,,],... The following is a su... 2 [2]
11 [[, , December 31,,],... The following is a su... 2 []
answer_from question scale answer_type req_comparison
0 table-text Which years does the ... multi-span false \
1 table-text What was the amount o... thousand span false
2 table-text What was the allowanc... thousand span false
3 table-text How many years did th... count false
4 table-text What was the change i... thousand arithmetic false
5 table What was the percenta... percent arithmetic false
6 table-text Which years does the ... multi-span false
7 table-text What was the amount o... thousand span false
8 table-text What was the allowanc... thousand span false
9 table-text How many years did th... count false
10 table-text What was the change i... thousand arithmetic false
11 table What was the percenta... percent arithmetic false
order answer derivation
0 1 [2019, 2018]
1 2 [225,167]
2 3 [(25,897)]
3 4 1 2018
4 5 424 -1,054-(-1,478)
5 6 129.87 (-25,897+11,266)/-11,266
6 1 [2019, 2018]
7 2 [225,167]
8 3 [(25,897)]
9 4 1 2018
10 5 424 -1,054-(-1,478)
11 6 129.87 (-25,897+11,266)/-11,266
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论