英文:
Create pandas dataframe column with random conditional numbers
问题
我已创建了以下的pandas数据框。
import pandas as pd
import numpy as np
# Two groups of 14 rows each: col1 is the group label (1 or 2) and col2 holds
# a constant per group (0 for group 1, 1 for group 2).
ds = {
    'col1': [1] * 14 + [2] * 14,
    'col2': [0] * 14 + [1] * 14,
}
data = pd.DataFrame(ds)
它看起来像这样:
print(data)
    col1  col2
0      1     0
1      1     0
2      1     0
3      1     0
4      1     0
5      1     0
6      1     0
7      1     0
8      1     0
9      1     0
10     1     0
11     1     0
12     1     0
13     1     0
14     2     1
15     2     1
16     2     1
17     2     1
18     2     1
19     2     1
20     2     1
21     2     1
22     2     1
23     2     1
24     2     1
25     2     1
26     2     1
27     2     1
我需要创建一个新的列(称为 col3),根据以下条件:
- 当 col1 = 1 时,有14条记录,其中 col2 = 0。新列(即 col3)需要让这14条记录中恰好50%的值等于 col2(在这14条记录中随机分布),其余50%等于1。
- 当 col1 = 2 时,有14条记录,其中 col2 = 1。新列(即 col3)需要让这14条记录中恰好50%的值等于 col2(在这14条记录中随机分布),其余50%等于0。
因此,最终的数据集将如下所示(请注意,col3 中的值的位置或记录是随机分配的)。
你需要的Python代码如下:
import random
# Build col3 so that, inside every col1 group, exactly half of the rows
# (picked at random) are replaced by a fixed value and the rest keep col2.
def generate_col3(frame):
    """Return the col3 values for ``frame`` as a Series aligned to its index.

    For each distinct ``col1`` value, half of that group's rows (rounded
    down) are chosen at random and set to the group's replacement value
    (1 when ``col1 == 1``, 0 when ``col1 == 2``); every other row keeps its
    ``col2`` value.

    Assumes the index labels of ``frame`` are unique.
    Raises ``KeyError`` if ``col1`` contains a value other than 1 or 2.
    """
    replacement = {1: 1, 2: 0}
    col3 = frame['col2'].copy()
    for key, labels in frame.groupby('col1').groups.items():
        # Sampling without replacement guarantees an exact split, unlike an
        # independent coin flip per row.
        chosen = random.sample(list(labels), len(labels) // 2)
        col3.loc[chosen] = replacement[key]
    return col3


# Add the new column in place; no rows are added, removed or reordered.
data['col3'] = generate_col3(data)
print(data)
这将生成符合您要求的数据框。
英文:
I have created the following pandas dataframe.
import pandas as pd
import numpy as np
# Two groups of 14 rows each: col1 is the group label (1 or 2) and col2 holds
# a constant per group (0 for group 1, 1 for group 2).
ds = {
    'col1': [1] * 14 + [2] * 14,
    'col2': [0] * 14 + [1] * 14,
}
data = pd.DataFrame(ds)
which looks like this:
print(data)
    col1  col2
0      1     0
1      1     0
2      1     0
3      1     0
4      1     0
5      1     0
6      1     0
7      1     0
8      1     0
9      1     0
10     1     0
11     1     0
12     1     0
13     1     0
14     2     1
15     2     1
16     2     1
17     2     1
18     2     1
19     2     1
20     2     1
21     2     1
22     2     1
23     2     1
24     2     1
25     2     1
26     2     1
27     2     1
I need to create a new column (called col3) subject to the following conditions:
- when col1 = 1, there are 14 records for which col2 = 0. The new column (i.e. col3) needs to have 50% (of exactly those 14 records) of the values equal to col2 (randomly distributed across the 14 records) and the remaining 50% equal to 1.
- when col1 = 2, there are 14 records for which col2 = 1. The new column (i.e. col3) needs to have 50% (of exactly those 14 records) of the values equal to col2 (randomly distributed across the 14 records) and the remaining 50% equal to 0.
So, the resulting dataset would look like this (bear in mind that the location - or record - of the values in col3 is randomly assigned):
Does anyone know the python code to produce such dataframe?
答案1
得分: 2
# Randomly pick 50% of the rows within each col1 group; these rows keep
# their col2 value.
sampled = data.groupby('col1')['col2'].sample(frac=.5)
# Start every row at its group's fill value (1 when col1 == 1, 0 when
# col1 == 2), then overwrite the sampled rows. This keeps col3 as integers
# without going through NaN and the deprecated fillna(downcast=...) keyword.
data['col3'] = data['col1'].map({1: 1, 2: 0})
data.loc[sampled.index, 'col3'] = sampled
英文:
groupby + sample
# Randomly pick 50% of the rows within each col1 group; these rows keep
# their col2 value.
sampled = data.groupby('col1')['col2'].sample(frac=.5)
# Start every row at its group's fill value (1 when col1 == 1, 0 when
# col1 == 2), then overwrite the sampled rows. This keeps col3 as integers
# without going through NaN and the deprecated fillna(downcast=...) keyword.
data['col3'] = data['col1'].map({1: 1, 2: 0})
data.loc[sampled.index, 'col3'] = sampled
Result
    col1  col2  col3
0      1     0     1
1      1     0     0
2      1     0     0
3      1     0     0
4      1     0     0
5      1     0     0
6      1     0     0
7      1     0     1
8      1     0     0
9      1     0     1
10     1     0     1
11     1     0     1
12     1     0     1
13     1     0     1
14     2     1     1
15     2     1     0
16     2     1     1
17     2     1     0
18     2     1     0
19     2     1     1
20     2     1     1
21     2     1     0
22     2     1     1
23     2     1     0
24     2     1     0
25     2     1     1
26     2     1     1
27     2     1     0
答案2
得分: 1
我会使用 df.sample() 方法,在每个条件内随机分出各个子组,然后再分别赋值:
# Use DataFrame.sample() to split each conditional subgroup randomly in half,
# then assign the col_3 value for each half.

# Set up the dataframe and create the new column, initialised to 0.
ds = {
    'col1': [1] * 14 + [2] * 14,
    'col2': [0] * 14 + [1] * 14,
}
data = pd.DataFrame(data=ds)
data['col_3'] = 0

# Select the rows that match each condition.
cond_1 = data.loc[(data.col1 == 1) & (data.col2 == 0)]
cond_2 = data.loc[(data.col1 == 2) & (data.col2 == 1)]

# First condition: take a random 50%, then the remaining 50%.
cond_1_A = cond_1.sample(frac=0.5)
cond_1_B = cond_1.loc[cond_1.index.difference(cond_1_A.index)]
# Set one half to 0 and the other half to 1. Assign through
# data.loc[rows, column]: the chained form data.col_3.loc[rows] = value is not
# guaranteed to modify data and has no effect under copy-on-write.
data.loc[cond_1_A.index, 'col_3'] = 0
data.loc[cond_1_B.index, 'col_3'] = 1

# Second condition: same procedure.
cond_2_A = cond_2.sample(frac=0.5)
cond_2_B = cond_2.loc[cond_2.index.difference(cond_2_A.index)]
data.loc[cond_2_A.index, 'col_3'] = 0
data.loc[cond_2_B.index, 'col_3'] = 1
# Done.
运行1:
   col1  col2  col_3
0     1     0      0
1     1     0      1
2     1     0      1
3     1     0      0
4     1     0      1
5     1     0      1
6     1     0      0
7     1     0      0
8     1     0      0
9     1     0      1
10    1     0      0
11    1     0      0
12    1     0      1
13    1     0      1
14    2     1      1
15    2     1      1
16    2     1      0
17    2     1      0
18    2     1      0
19    2     1      0
20    2     1      1
21    2     1      0
22    2     1      0
23    2     1      1
24    2     1      1
25    2     1      1
26    2     1      0
27    2     1      1
运行2:
   col1  col2  col_3
0     1     0      1
1     1     0      0
2     1     0      1
3     1     0      1
4     1     0      1
5     1     0      0
6     1     0      1
7     1     0      0
8     1     0      0
9     1     0      0
10    1     0      1
11    1     0      0
12    1     0      0
13    1     0      1
14    2     1      1
15    2     1      0
16    2     1      0
17    2     1      1
18    2     1      1
19    2     1      1
20    2     1      1
21    2     1      0
22    2     1      1
23    2     1      0
24    2     1      0
25    2     1      1
26    2     1      0
27    2     1      0
英文:
I would use the df.sample() method to isolate all the subgroups randomly within the condition, and then assign the values
# Set up the dataframe and create the new column, initialised to 0.
ds = {
    'col1': [1] * 14 + [2] * 14,
    'col2': [0] * 14 + [1] * 14,
}
data = pd.DataFrame(data=ds)
data['col_3'] = 0

# Now let's select by condition.
cond_1 = data.loc[(data.col1 == 1) & (data.col2 == 0)]
cond_2 = data.loc[(data.col1 == 2) & (data.col2 == 1)]

# Get a random 50% of the first condition, and then get the remaining 50%.
cond_1_A = cond_1.sample(frac=.5)
cond_1_B = cond_1.loc[cond_1.index.difference(cond_1_A.index)]

# For each sub-group, set the value to 0 or 1. Assign through
# data.loc[rows, column]: the chained form data.col_3.loc[rows] = value is not
# guaranteed to modify data and has no effect under copy-on-write.
data.loc[cond_1_A.index, 'col_3'] = 0
data.loc[cond_1_B.index, 'col_3'] = 1

# Second condition - same thing.
cond_2_A = cond_2.sample(frac=.5)
cond_2_B = cond_2.loc[cond_2.index.difference(cond_2_A.index)]
data.loc[cond_2_A.index, 'col_3'] = 0
data.loc[cond_2_B.index, 'col_3'] = 1
That should do it.
Run 1
data
	col1	col2	col_3
0	1	0	0
1	1	0	1
2	1	0	1
3	1	0	0
4	1	0	1
5	1	0	1
6	1	0	0
7	1	0	0
8	1	0	0
9	1	0	1
10	1	0	0
11	1	0	0
12	1	0	1
13	1	0	1
14	2	1	1
15	2	1	1
16	2	1	0
17	2	1	0
18	2	1	0
19	2	1	0
20	2	1	1
21	2	1	0
22	2	1	0
23	2	1	1
24	2	1	1
25	2	1	1
26	2	1	0
27	2	1	1
Run 2
data
	col1	col2	col_3
0	1	0	1
1	1	0	0
2	1	0	1
3	1	0	1
4	1	0	1
5	1	0	0
6	1	0	1
7	1	0	0
8	1	0	0
9	1	0	0
10	1	0	1
11	1	0	0
12	1	0	0
13	1	0	1
14	2	1	1
15	2	1	0
16	2	1	0
17	2	1	1
18	2	1	1
19	2	1	1
20	2	1	1
21	2	1	0
22	2	1	1
23	2	1	0
24	2	1	0
25	2	1	1
26	2	1	0
27	2	1	0
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。



评论