将公式应用/创建到操作两个数据框。

huangapple go评论70阅读模式
英文:

apply/create formula to manipulate two dataframes

问题

我有两个在R中的数据框

df1
chr   start     end strand bam1 bam2 bam3 bam4 bam5 bam6 bam7 bam8
1 chr1 3531569 3531966      -    2    2    1    4    8   36   21    1
2 chr1 3670538 3672624      -  251   50  170  165  294  259  665   86
3 chr1 4491645 4493854      -  220   46  179  167  275  332  414   77
4 chr1 4496542 4497750      -  115   41  100   67  114   69   42   63
5 chr1 4571267 4572265      -   64   32   77   44   76  130  179   27
6 chr1 4688213 4688719      -   39   10   20   20   14   23   25   17
7 chr1 4688800 4688919      -   20   30   10   20   14   55   17   20
8 chr1 4688800 4688919      -    2    4    6    8   10   12   14   16
9 chr1 4688800 4688919      -    1    2    3    4    5    6    7    8

df2
bam_file r1 r2
1     bam1  2  1
2     bam2  9  3
3     bam3  1  4
4     bam4  1  5
5     bam5  1  1
6     bam6  8  6
7     bam7  3  7
8     bam8  3  2

我想应用以下公式(假设为X),使df2的列乘以df1的行

((df2[1,2]-df1[1,5])ˆ2 + (df2[2,2]-df1[1,6])ˆ2 + (df2[3,2]-df1[1,7])ˆ2 + (df2[4,2]-df1[1,8])ˆ2 + (df2[5,2]-df1[1,9])ˆ2 + (df2[6,2]-df1[1,10])ˆ2 + (df2[7,2]-df1[1,11])ˆ2 + 
 (df2[8,2]-df1[1,12])ˆ2)/(ncol(df1)-4)

所以期望的输出将是

output
        r1        r2
1  152.375    144.75
2 89140.25 88467.875
3 57822.75 57413.125
4 6195.125      6148
5 8007.375   7858.75
6   395.75   372.625
7   508.75   543.125
8    60.75    47.125
9     15.5     6.875

如果您需要任何进一步的帮助,请随时提问。

英文:

I have two dataframes in R

df1
chr   start     end strand bam1 bam2 bam3 bam4 bam5 bam6 bam7 bam8
1 chr1 3531569 3531966      -    2    2    1    4    8   36   21    1
2 chr1 3670538 3672624      -  251   50  170  165  294  259  665   86
3 chr1 4491645 4493854      -  220   46  179  167  275  332  414   77
4 chr1 4496542 4497750      -  115   41  100   67  114   69   42   63
5 chr1 4571267 4572265      -   64   32   77   44   76  130  179   27
6 chr1 4688213 4688719      -   39   10   20   20   14   23   25   17
7 chr1 4688800 4688919      -   20   30   10   20   14   55   17   20
8 chr1 4688800 4688919      -    2    4    6    8   10   12   14   16
9 chr1 4688800 4688919      -    1    2    3    4    5    6    7    8

and

df2
bam_file r1 r2
1     bam1  2  1
2     bam2  9  3
3     bam3  1  4
4     bam4  1  5
5     bam5  1  1
6     bam6  8  6
7     bam7  3  7
8     bam8  3  2

I want to apply following formula (let say X), So that column of df2 X row of df1

((df2[1,2]-df1[1,5])ˆ2 + (df2[2,2]-df1[1,6])ˆ2 + (df2[3,2]-df1[1,7])ˆ2 + (df2[4,2]-df1[1,8])ˆ2 + (df2[5,2]-df1[1,9])ˆ2 + (df2[6,2]-df1[1,10])ˆ2 + (df2[7,2]-df1[1,11])ˆ2 + 
 (df2[8,2]-df1[1,12])ˆ2)/(ncol(df1)-4)

So the desired output will be

output
        r1        r2
1  152.375    144.75
2 89140.25 88467.875
3 57822.75 57413.125
4 6195.125      6148
5 8007.375   7858.75
6   395.75   372.625
7   508.75   543.125
8    60.75    47.125
9     15.5     6.875

I apologize if this appears to be a repetitive question, but I tried and was unable to resolve it (as I am beginner and learning). It would be great to find a solution. Thank you in advance and looking for a positive response.

答案1

得分: 1

以下是您要翻译的内容:

We could create a sequence column ('rn'), reshape to 'long' format with pivot_longer on the first data, join with the second data ('df2') and do a group by calculation on the 'r1', 'r2' columns in reframe

library(dplyr) # version >= 1.1.0
library(tidyr)
df1 %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols= starts_with("bam"), names_to = "bam_file") %>%
  left_join(df2) %>%
  reframe(across(r1:r2, ~ sum((value - .x)^2)/n()), .by = "rn")

-output

# A tibble: 9 × 3
     rn      r1       r2
  <int>   <dbl>    <dbl>
1     1   152.    145.  
2     2 89140.  88468.  
3     3 57823.  57413.  
4     4  6195.   6148   
5     5  8007.   7859.  
6     6   396.    373.  
7     7   509.    543.  
8     8    60.8    47.1 
9     9    15.5     6.88

data

df1 <- structure(list(chr = c("chr1", "chr1", "chr1", "chr1", "chr1",
"chr1", "chr1", "chr1", "chr1"), start = c(3531569L, 3670538L,
4491645L, 4496542L, 4571267L, 4688213L, 4688800L, 4688800L, 4688800L
), end = c(3531966L, 3672624L, 4493854L, 4497750L, 4572265L,
4688719L, 4688919L, 4688919L, 4688919L), strand = c("-", "-",
"-", "-", "-", "-", "-", "-", "-"), bam1 = c(2L, 251L, 220L,
115L, 64L, 39L, 20L, 2L, 1L), bam2 = c(2L, 50L, 46L, 41L, 32L,
10L, 30L, 4L, 2L), bam3 = c(1L, 170L, 179L, 100L, 77L, 20L, 10L,
6L, 3L), bam4 = c(4L, 165L, 167L, 67L, 44L, 20L, 20L, 8L, 4L),
    bam5 = c(8L, 294L, 275L, 114L, 76L, 14L, 14L, 10L, 5L), bam6 = c(36L,
    259L, 332L, 69L, 130L, 23L, 55L, 12L, 6L), bam7 = c(21L,
    665L, 414L, 42L, 179L, 25L, 17L, 14L, 7L), bam8 = c(1L, 86L,
    77L, 63L, 27L, 17L, 20L, 16L, 8L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))

df2 <- structure(list(bam_file = c("bam1", "bam2", "bam3", "bam4", "bam5",
"bam6", "bam7", "bam8"), r1 = c(2L, 9L, 1L, 1L, 1L, 8L, 3L, 3L
), r2 = c(1L, 3L, 4L, 5L, 1L, 6L, 7L, 2L)), class = "data.frame", row names = c("1",
"2", "3", "4", "5", "6", "7", "8"))

注意:我已将代码部分保留在英文状态,不做翻译。

英文:

We could create a sequence column ('rn'), reshape to 'long' format with pivot_longer on the first data, join with the second data ('df2') and do a group by calculation on the 'r1', 'r2' columns in reframe

library(dplyr) # version &gt;= 1.1.0
library(tidyr)
df1 %&gt;%
mutate(rn = row_number()) %&gt;%  
pivot_longer(cols= starts_with(&quot;bam&quot;), names_to = &quot;bam_file&quot;) %&gt;% 
left_join(df2) %&gt;%  
reframe(across(r1:r2, ~ sum((value - .x)^2)/n()), .by = &quot;rn&quot;)

-output

# A tibble: 9 &#215; 3
rn      r1       r2
&lt;int&gt;   &lt;dbl&gt;    &lt;dbl&gt;
1     1   152.    145.  
2     2 89140.  88468.  
3     3 57823.  57413.  
4     4  6195.   6148   
5     5  8007.   7859.  
6     6   396.    373.  
7     7   509.    543.  
8     8    60.8    47.1 
9     9    15.5     6.88

data

df1 &lt;- structure(list(chr = c(&quot;chr1&quot;, &quot;chr1&quot;, &quot;chr1&quot;, &quot;chr1&quot;, &quot;chr1&quot;, 
&quot;chr1&quot;, &quot;chr1&quot;, &quot;chr1&quot;, &quot;chr1&quot;), start = c(3531569L, 3670538L, 
4491645L, 4496542L, 4571267L, 4688213L, 4688800L, 4688800L, 4688800L
), end = c(3531966L, 3672624L, 4493854L, 4497750L, 4572265L, 
4688719L, 4688919L, 4688919L, 4688919L), strand = c(&quot;-&quot;, &quot;-&quot;, 
&quot;-&quot;, &quot;-&quot;, &quot;-&quot;, &quot;-&quot;, &quot;-&quot;, &quot;-&quot;, &quot;-&quot;), bam1 = c(2L, 251L, 220L, 
115L, 64L, 39L, 20L, 2L, 1L), bam2 = c(2L, 50L, 46L, 41L, 32L, 
10L, 30L, 4L, 2L), bam3 = c(1L, 170L, 179L, 100L, 77L, 20L, 10L, 
6L, 3L), bam4 = c(4L, 165L, 167L, 67L, 44L, 20L, 20L, 8L, 4L), 
bam5 = c(8L, 294L, 275L, 114L, 76L, 14L, 14L, 10L, 5L), bam6 = c(36L, 
259L, 332L, 69L, 130L, 23L, 55L, 12L, 6L), bam7 = c(21L, 
665L, 414L, 42L, 179L, 25L, 17L, 14L, 7L), bam8 = c(1L, 86L, 
77L, 63L, 27L, 17L, 20L, 16L, 8L)), class = &quot;data.frame&quot;, row.names = c(&quot;1&quot;, 
&quot;2&quot;, &quot;3&quot;, &quot;4&quot;, &quot;5&quot;, &quot;6&quot;, &quot;7&quot;, &quot;8&quot;, &quot;9&quot;))
df2 &lt;- structure(list(bam_file = c(&quot;bam1&quot;, &quot;bam2&quot;, &quot;bam3&quot;, &quot;bam4&quot;, &quot;bam5&quot;, 
&quot;bam6&quot;, &quot;bam7&quot;, &quot;bam8&quot;), r1 = c(2L, 9L, 1L, 1L, 1L, 8L, 3L, 3L
), r2 = c(1L, 3L, 4L, 5L, 1L, 6L, 7L, 2L)), class = &quot;data.frame&quot;, row.names = c(&quot;1&quot;, 
&quot;2&quot;, &quot;3&quot;, &quot;4&quot;, &quot;5&quot;, &quot;6&quot;, &quot;7&quot;, &quot;8&quot;))

huangapple
  • 本文由 发表于 2023年2月18日 03:40:52
  • 转载请务必保留本文链接:https://go.coder-hub.com/75488606.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定