比较两个 defaultdict(list) 使用逻辑条件

huangapple go评论65阅读模式
英文:

compare two defaultdict(list) with logical conditions

问题

Here is the translation of the code you provided:

from collections import defaultdict

# Sample data: both containers map a position key (e.g. '3:42259955')
# to a list of per-record dicts; they start out empty here.
ids = defaultdict(list)
V1 = defaultdict(list)

# Your code here
def getValueOf(k, L):
    """Return ``(0, value)`` for key ``k`` of the first record in ``L``.

    Only the first record is ever inspected: the function returns as soon
    as it has looked at it. If ``L`` is empty the result is ``None``.

    Args:
        k: key to look up in the first record.
        L: iterable of dict-like records.

    Returns:
        A ``(0, first_record[k])`` tuple, or ``None`` when ``L`` is empty.

    Raises:
        KeyError: if the first record does not contain ``k``.
    """
    for record in L:
        # Leave on the very first record; the index is therefore always 0.
        return 0, record[k]
    return None

# Visit only the position keys that are present in both mappings.
for key in ids.keys() & V1.keys():
    # NOTE: getValueOf only inspects the first record of the list it is
    # given, so each comparison below looks at a single record per list.
    if getValueOf('ref', ids[key]) == getValueOf('base', ids[key]):
        ref_count = getValueOf('count', ids[key])
        ref_depth = getValueOf('depth', ids[key])
    # Compare the variant allele from V1 with the base recorded in ids.
    if getValueOf('var', V1[key]) == getValueOf('base', ids[key]):
        var_count = getValueOf('count', ids[key])

If you have any specific questions or need further assistance, please let me know.

英文:

two defaultdict(list)

ids

3:42259955 [{'chr': '3', 'ref': 'G', 'depth': '224', 'base': 'A', 'count': '1', 'positive_strand': '0', 'negative_strand': '1', 'percent_bias': 0.0, 'vaf': 0.0, 'mutation': 'snv', 'group': '5555', 'timepoint': 'D0', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '224', 'base': 'C', 'count': '0', 'positive_strand': '0', 'negative_strand': '0', 'percent_bias': '0', 'vaf': '0', 'mutation': 'snv', 'group': '5555', 'timepoint': 'D0', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '224', 'base': 'G', 'count': '223', 'positive_strand': '121', 'negative_strand': '102', 'percent_bias': 0.54, 'vaf': 1.0, 'mutation': 'no-mutation', 'group': '5555', 'timepoint': 'D0', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '224', 'base': 'T', 'count': '0', 'positive_strand': '0', 'negative_strand': '0', 'percent_bias': '0', 'vaf': '0', 'mutation': 'snv', 'group': '5555', 'timepoint': 'D0', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '224', 'base': 'N', 'count': '0', 'positive_strand': '0', 'negative_strand': '0', 'percent_bias': '0', 'vaf': '0', 'mutation': 'snv', 'group': '5555', 'timepoint': 'D0', 'st': '42259955'}]

V1

3:42259955 [{'group': '5555', 'timepoint': 'D0', 'chrm': '3', 'st': '42259955', 'en': '42259956', 'var': 'C'}, {'group': '5555', 'timepoint': 'C1', 'chrm': '3', 'st': '42259955', 'en': '42259956', 'var': 'C'}, {'group': '5555', 'timepoint': 'C3', 'chrm': '3', 'st': '42259955', 'en': '42259956', 'var': 'C'}, {'group': '5555', 'timepoint': 'C4', 'chrm': '3', 'st': '42259955', 'en': '42259956', 'var': 'C'}]

What I intend to do is:

compare the two defaultdict lists:
first, check whether the key matches
check whether ref and base are the same in ids; if yes, store the depth info (this will be constant),
which is this entry
{'chr': '3', 'ref': 'G', 'depth': '224', 'base': 'G', 'count': '223', 'positive_strand': '121', 'negative_strand': '102', 'percent_bias': 0.54, 'vaf': 1.0, 'mutation': 'no-mutation', 'group': '5555', 'timepoint': 'D0', 'st': '42259955'}
check whether base in ids == var in V1 (in this case 'C'); if yes, then get the count (which is 0) from ids
check the timepoints: if a timepoint is not in ids but is in the variant list (V1), take the timepoint info from V1 and fill in the other info from ids

Desired output

position	timepoint chr	st	depth	count	base	positive_strand	negative_strand	percent_bias	vaf
3:42259955 D0	3	42259955	224	0	C	0	0	0	0
3:42259955 C1	3	42259955	224	0	C	0	0	0	0
3:42259955 C3	3	42259955	224	0	C	0	0	0	0
3:42259955 C4	3	42259955	224	0	C	0	0	0	0

What i have so far

def getValueOf(k, L):
    """Print ``len(L)`` and return ``(0, value)`` for the first record.

    Only the first record of ``L`` is ever inspected: the function returns
    as soon as it has looked at it. If ``L`` is empty the result is ``None``.

    Args:
        k: key to look up in the first record.
        L: sized iterable of dict-like records.

    Returns:
        A ``(0, first_record[k])`` tuple, or ``None`` when ``L`` is empty.

    Raises:
        KeyError: if the first record does not contain ``k``.
    """
    print(len(L))
    for record in L:
        # Leave on the very first record; the index is therefore always 0.
        return 0, record[k]
    return None
# Visit only the position keys that are present in both mappings.
for key in ids.keys() & V1.keys():
    # First condition: compare fields within the ids records.
    # NOTE: getValueOf only inspects the first record of the list it is
    # given, so each comparison looks at a single record per list.
    if getValueOf('ref', ids[key]) == getValueOf('base', ids[key]):
        ref_count = getValueOf('count', ids[key])
        ref_depth = getValueOf('depth', ids[key])
    # Second condition: compare between the two defaultdicts.
    if getValueOf('var', V1[key]) == getValueOf('base', ids[key]):
        var_count = getValueOf('count', ids[key])

Is there a more elegant way to do this? Should I use a defaultdict in the first place, or would a nested dictionary work just as well?

Update

V1

3:42259955 [{'group': '555', 'timepoint': 'D0', 'chrm': '3', 'st': '42259955', 'en': '42259956', 'var': 'C'}, {'group': '555', 'timepoint': 'C1', 'chrm': '3', 'st': '42259955', 'en': '42259956', 'var': 'C'}, {'group': '555', 'timepoint': 'C3', 'chrm': '3', 'st': '42259955', 'en': '42259956', 'var': 'C'}, {'group': '555', 'timepoint': 'C4', 'chrm': '3', 'st': '42259955', 'en': '42259956', 'var': 'C'}]

ids

3:42259955 [{'chr': '3', 'ref': 'G', 'depth': '141', 'base': 'A', 'count': '1', 'positive_strand': '0', 'negative_strand': '1', 'percent_bias': 0.0, 'vaf': 0.01, 'mutation': 'snv', 'group': '555', 'timepoint': 'C4', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '141', 'base': 'C', 'count': '4', 'positive_strand': '0', 'negative_strand': '4', 'percent_bias': 0.0, 'vaf': 0.03, 'mutation': 'snv', 'group': '555', 'timepoint': 'C4', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '141', 'base': 'G', 'count': '135', 'positive_strand': '99', 'negative_strand': '36', 'percent_bias': 0.73, 'vaf': 0.96, 'mutation': 'no-mutation', 'group': '555', 'timepoint': 'C4', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '141', 'base': 'T', 'count': '1', 'positive_strand': '0', 'negative_strand': '1', 'percent_bias': 0.0, 'vaf': 0.01, 'mutation': 'snv', 'group': '555', 'timepoint': 'C4', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '141', 'base': 'N', 'count': '0', 'positive_strand': '0', 'negative_strand': '0', 'percent_bias': '0', 'vaf': '0', 'mutation': 'snv', 'group': '555', 'timepoint': 'C4', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '141', 'base': '+A', 'count': '1', 'positive_strand': '0', 'negative_strand': '1', 'percent_bias': 0.0, 'vaf': 0.01, 'mutation': 'ins', 'group': '555', 'timepoint': 'C4', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '141', 'base': '+C', 'count': '13', 'positive_strand': '0', 'negative_strand': '13', 'percent_bias': 0.0, 'vaf': 0.09, 'mutation': 'ins', 'group': '555', 'timepoint': 'C4', 'st': '42259955'}, {'chr': '3', 'ref': 'G', 'depth': '141', 'base': '+T', 'count': '11', 'positive_strand': '0', 'negative_strand': '11', 'percent_bias': 0.0, 'vaf': 0.08, 'mutation': 'ins', 'group': '555', 'timepoint': 'C4', 'st': '42259955'}]

from code

     position  timepoint chr ref        st depth count base positive_strand negative_strand  percent_bias   vaf
0   3:42259955      D0   3   G  42259955   141     4    C               0               4           0.0  0.03
1   3:42259955      C1   3   G  42259955   141     4    C               0               4           0.0  0.03
2   3:42259955      C3   3   G  42259955   141     4    C               0               4           0.0  0.03
3   3:42259955	C4   3   G  42259955   141     4    C               0               4           0.0  0.03

desired output

    position  timepoint chr ref        st depth count base positive_strand negative_strand  percent_bias   vaf
0   3:42259955      D0   3   G  42259955   141     0    C               0               0          0.0  0.00
1   3:42259955      C1   3   G  42259955   141     0    C               0               0           0.0  0.00
2   3:42259955      C3   3   G  42259955   141     0    C               0               0           0.0  0.00
3   3:42259955	C4   3   G  42259955   141     4    C               0               4           0.0  0.03

答案1

得分: 2

以下是您要翻译的内容:

解决新问题的更新:

这应该是一个可行的解决方案。然而,现在有这么多条件和细节,我怀疑我们最好是使用pandas创建一些表格,并进行一些连接和聚合查询,以提高代码的效率和简洁性,而不是学习如何使用for循环来迭代嵌套字典。

def comb_dicts(ids, v1):
    """Merge variant records from ``v1`` with the matching base rows in ``ids``.

    For every position key present in both mappings, each variant dict in
    ``v1[key]`` is combined with the ``ids[key]`` rows whose ``base`` equals
    the variant's ``var`` and differs from ``ref``:

    * if some of those rows share the variant's ``timepoint``, one merged
      row is produced per such row, keeping the values found in ``ids``;
    * otherwise every candidate row is reused as a template: the variant's
      own ``timepoint`` is kept and ``count``, ``positive_strand``,
      ``negative_strand``, ``percent_bias`` and ``vaf`` are reset to zero.

    Args:
        ids: mapping of position key -> list of per-base dicts; each dict
            must contain ``'base'`` and ``'ref'`` (and ``'timepoint'`` when
            it is a candidate for a variant).
        v1: mapping of position key -> list of variant dicts; each dict
            must contain ``'var'`` and ``'timepoint'``.

    Returns:
        A list of newly built dicts, each with an added ``'position'`` key.
        The input dicts are not modified. The zero-filled fallback can
        produce identical rows, so callers may want to de-duplicate.

    Raises:
        KeyError: if a record lacks one of the keys listed above.
    """
    # Values substituted when a variant's timepoint has no counterpart in ids.
    def_cols = {
        'count': 0, 'positive_strand': 0,
        'negative_strand': 0, 'percent_bias': 0.0, 'vaf': 0.0,
    }
    rows = []
    # Only positions present in both mappings can be combined.
    for k in ids.keys() & v1.keys():
        for d in v1[k]:
            # Non-reference rows in ids whose base is this variant's allele.
            candidates = [
                v for v in ids[k]
                if v['base'] != v['ref'] and d['var'] == v['base']
            ]
            # Build new dicts with ** unpacking instead of update() so the
            # originals stay untouched. v is unpacked before d so that keys
            # present in both take the value from the V1 dict.
            var_ds = [
                {**v, **d, 'position': k}
                for v in candidates
                if d['timepoint'] == v['timepoint']
            ]
            if not var_ds:
                # No row with this timepoint: reuse the candidates as
                # templates. Thanks to the v-then-d order the V1 timepoint
                # is kept; unpacking def_cols last overwrites the columns
                # that must not be copied from ids.
                var_ds = [
                    {**v, **d, 'position': k, **def_cols}
                    for v in candidates
                ]
            rows.extend(var_ds)
    return rows


import pandas as pd

# Column order for the final report; defined at module level so that it is
# in scope for the column selection below.
fields = [
    'position', 'timepoint', 'chr',
    'st', 'depth', 'count', 'base',
    'positive_strand', 'negative_strand',
    'percent_bias', 'vaf',
]

my_rows = comb_dicts(ids, V1)
df = pd.DataFrame.from_records(my_rows)
# The combined rows can contain exact repeats, so drop them.
df.drop_duplicates(inplace=True)
df[fields]

# If you want the de-duplicated rows as a list of dicts, do the following
uniq_rows = df.to_dict('records')

请注意,我只翻译了代码中的注释和字符串,其他部分没有翻译。如果需要进一步的帮助,请告诉我。

英文:

Ok, so I'm still not sure I've got your requirement down 100%. And it's certainly hard to know what oddities might crop up in a larger dataset, and also how inefficient this could become at scale. But I think I have solved your problem.

UPDATED TO SOLVE THE NEW PROBLEM:

This should be a viable solution. However at this point there are so many conditions and wrinkles, that I suspect we may be better off creating some tables using pandas and performing some joining and aggregating queries in terms of efficiency and simplicity of code, rather than learning how to use for loops to iterate over nested dicts.

def comb_dicts(ids, v1):
    """Merge variant records from ``v1`` with the matching base rows in ``ids``.

    For every position key present in both mappings, each variant dict in
    ``v1[key]`` is combined with the ``ids[key]`` rows whose ``base`` equals
    the variant's ``var`` and differs from ``ref``:

    * if some of those rows share the variant's ``timepoint``, one merged
      row is produced per such row, keeping the values found in ``ids``;
    * otherwise every candidate row is reused as a template: the variant's
      own ``timepoint`` is kept and ``count``, ``positive_strand``,
      ``negative_strand``, ``percent_bias`` and ``vaf`` are reset to zero.

    Args:
        ids: mapping of position key -> list of per-base dicts; each dict
            must contain ``'base'`` and ``'ref'`` (and ``'timepoint'`` when
            it is a candidate for a variant).
        v1: mapping of position key -> list of variant dicts; each dict
            must contain ``'var'`` and ``'timepoint'``.

    Returns:
        A list of newly built dicts, each with an added ``'position'`` key.
        The input dicts are not modified. The zero-filled fallback can
        produce identical rows, so callers may want to de-duplicate.

    Raises:
        KeyError: if a record lacks one of the keys listed above.
    """
    # Values substituted when a variant's timepoint has no counterpart in ids.
    def_cols = {
        'count': 0, 'positive_strand': 0,
        'negative_strand': 0, 'percent_bias': 0.0, 'vaf': 0.0,
    }
    # Make a list for our output rows
    rows = []
    # Iterate through shared keys
    for k in ids.keys() & v1.keys():
        # Loop through the dicts in V1
        for d in v1[k]:
            # Non-reference rows in ids whose base is this variant's allele.
            candidates = [
                v for v in ids[k]
                if v['base'] != v['ref'] and d['var'] == v['base']
            ]
            # Find any matching dicts in the ids list - where the timepoints match.
            # Use ** unpacking to create new dicts - don't update because that
            # would alter the originals. Note the order of v and d: it ensures
            # that any keys in both use the value from the V1 dict.
            var_ds = [
                {**v, **d, 'position': k}
                for v in candidates
                if d['timepoint'] == v['timepoint']
            ]
            if not var_ds:
                # No row with a matching timepoint: reuse the candidates as
                # templates. The v-then-d order keeps the V1 timepoint, and
                # unpacking def_cols last overwrites the columns that we
                # don't want copied from ids. Some resulting dicts can be
                # identical, so de-dup later (e.g. with pandas).
                var_ds = [
                    {**v, **d, 'position': k, **def_cols}
                    for v in candidates
                ]
            rows.extend(var_ds)
    return rows
import pandas as pd

# Column order for the final report; defined at module level so that it is
# in scope for the column selection below.
fields = [
    'position', 'timepoint', 'chr',
    'st', 'depth', 'count', 'base',
    'positive_strand', 'negative_strand',
    'percent_bias', 'vaf',
]

my_rows = comb_dicts(ids, V1)
df = pd.DataFrame.from_records(my_rows)
# The combined rows can contain exact repeats, so drop them.
df.drop_duplicates(inplace=True)
df[fields]

# If you want the de-duped rows as a list of dicts then do
uniq_rows = df.to_dict('records')

huangapple
  • 本文由 发表于 2020年1月6日 22:37:26
  • 转载请务必保留本文链接:https://go.coder-hub.com/59613955.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定