2023年6月6日 09:09:07go评论98阅读模式

英文:

How to be concise in R code: 1) Reading lists of *.csv files -> 2) rename lists -> 3) merge -> 4) get follow up rates

问题

以下是一个工作中的代码，其中

我从9个不同的文件夹中读取csv文件列表
将每个文件夹转化为一个列表
重命名列表的元素（去掉'.csv'和其他字符）
在每个文件夹内连接以创建一个大的数据框
合并所有9个数据框以创建一个数据框
获取后续的RID和失访RID以及它们的比率。

从as1到as3，有AS{n}_AREA列，从as4到as9有AS{n}_DATA_CLASS列。对于MRE，

我大部分使用了这里提供的想法，但当我将它们添加在一起时，代码看起来有些多余，可能可以做更多以使其对其他人更整洁。

有什么建议吗？谢谢！

library(dplyr); library(plyr)
library(magrittr); library(stringr) 
library(ExclusionTable)
library(lubridate)
library(tidyverse); library(tidyr)
library(janitor)
library(survival)
library(ggsurvfit); library(gtsummary)
library(zoo)
library(tidycmprsk)
# AA cohort (2 of 3)
## as
i <- 1
num_fu <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
as <- data.frame()
df <- data.frame()
dfs <- data.frame()
data_dir <- 'C:/Users/thepr/Documents/data/as'

# Baseline folder (as1): list its CSV files, read each one into a data frame,
# name the list elements with the first six characters of the file name, and
# full-join them into a single data frame.
assign(paste0("flnames", i), list.files(path = paste0(data_dir, i), pattern = "\\.csv", full.names = TRUE))
assign(paste0("as", i, "_list"), lapply(get(paste0("flnames", i)),
                                        function(x){base::as.data.frame(read.csv(x))}))
nm <- gsub(".csv", "", basename(eval(parse(text = paste0("flnames", i))))) %>% str_sub(., 1,6)
assign(paste0("as", i, "_list"), setNames(get(paste0("as", i, "_list")), nm))
df <- Reduce(full_join, get(paste0("as", i, "_list")))
# Keep only the first of any columns whose contents are identical.
assign(paste0("as",i), df[!duplicated(base::as.list(df))])
dfs <- df

# Follow-up folders (as2 ... as9). seq_along()[-1] yields 2..length(num_fu)
# and, unlike 2:length(num_fu), is empty when there is only one folder.
for (i in seq_along(num_fu)[-1]){
  # Same read / rename / join / de-duplicate steps as for the baseline folder.
  assign(paste0("flnames", i), list.files(path = paste0(data_dir, i), pattern = "\\.csv", full.names = TRUE))
  assign(paste0("as", i, "_list"), lapply(get(paste0("flnames", i)),
                                          function(x){base::as.data.frame(read.csv(x))}))
  nm <- gsub(".csv", "", basename(eval(parse(text = paste0("flnames", i))))) %>% str_sub(., 1,6)
  assign(paste0("as", i, "_list"), setNames(get(paste0("as", i, "_list")), nm))
  df <- Reduce(full_join, get(paste0("as", i, "_list")))
  assign(paste0("as",i), df[!duplicated(base::as.list(df))])

  # Flags the baseline RIDs that also occur in this follow-up. It is computed
  # only after as<i> has been created, so the loop also runs in a fresh session.
  # NOTE(review): the mask has one entry per row of as1 but is used below to
  # subset rows of as<i>; confirm this is the intended selection.
  RID_common <- as1$RID %in% get(paste0("as", i))$RID

  # Left-join this follow-up onto the accumulated table by RID, then drop
  # columns whose contents duplicate an earlier column.
  dfs <- merge(dfs, df, by = "RID", all.x = TRUE)
  dfs <- dfs[!duplicated(base::as.list(dfs))]

  if(paste0("AS", i, "_AREA") %in% colnames(get(paste0("as", i)))){
    assign(paste0("fu_",i-1), get(paste0("as", i))[RID_common, c("RID", paste0("AS", i, "_AREA"))])
    assign(paste0("fu_loss_",i-1), get(paste0("as", i))[!RID_common, c("RID", paste0("AS", i, "_AREA"))])
    # FU rate: rows in this follow-up divided by rows in the baseline
    assign(paste0("fu_rate_", i-1), nrow(get(paste0("as", i)))/nrow(as1))
  }
  else if(paste0("AS", i, "_DATA_CLASS") %in% colnames(get(paste0("as", i)))){
    assign(paste0("fu_",i-1), get(paste0("as", i))[RID_common, c("RID", paste0("AS", i, "_DATA_CLASS"))])
    assign(paste0("fu_loss_",i-1), get(paste0("as", i))[!RID_common, c("RID", paste0("AS", i, "_DATA_CLASS"))])
    # FU rate: rows in this follow-up divided by rows in the baseline
    assign(paste0("fu_rate_", i-1), nrow(get(paste0("as", i)))/nrow(as1))
  }
}

英文:

Below is a working code, where

1. I read lists of csv files from 9 different folders.
2. Make each folder a list.
3. Rename the elements of each list (drop '.csv' and other characters).
4. Join within each folder to make 1 big dataframe.
5. Merge all 9 dataframes to make 1 dataframe.
6. Get follow-up RIDs and loss-to-follow-up RIDs and their rates.

From as1 to as3, there are AS{n}_AREA columns and from as4 to as9 there are AS{n}_DATA_CLASS columns For MRE,

I have used ideas given here mostly, but when I add them up, the code somehow looks redundant and more could be done to look tidy to others.

Any ideas please? Thank you!

library(dplyr); library(plyr)
library(magrittr); library(stringr) 
library(ExclusionTable)
library(lubridate)
library(tidyverse); library(tidyr)
library(janitor)
library(survival)
library(ggsurvfit); library(gtsummary)
library(zoo)
library(tidycmprsk)
# AA cohort (2 of 3)
## as
i <- 1
num_fu <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
as <- data.frame()
df <- data.frame()
dfs <- data.frame()
data_dir <- 'C:/Users/thepr/Documents/data/as'

# Baseline folder (as1): list its CSV files, read each one into a data frame,
# name the list elements with the first six characters of the file name, and
# full-join them into a single data frame.
assign(paste0("flnames", i), list.files(path = paste0(data_dir, i), pattern = "\\.csv", full.names = TRUE))
assign(paste0("as", i, "_list"), lapply(get(paste0("flnames", i)),
                                        function(x){base::as.data.frame(read.csv(x))}))
nm <- gsub(".csv", "", basename(eval(parse(text = paste0("flnames", i))))) %>% str_sub(., 1,6)
assign(paste0("as", i, "_list"), setNames(get(paste0("as", i, "_list")), nm))
df <- Reduce(full_join, get(paste0("as", i, "_list")))
# Keep only the first of any columns whose contents are identical.
assign(paste0("as",i), df[!duplicated(base::as.list(df))])
dfs <- df

# Follow-up folders (as2 ... as9). seq_along()[-1] yields 2..length(num_fu)
# and, unlike 2:length(num_fu), is empty when there is only one folder.
for (i in seq_along(num_fu)[-1]){
  # Same read / rename / join / de-duplicate steps as for the baseline folder.
  assign(paste0("flnames", i), list.files(path = paste0(data_dir, i), pattern = "\\.csv", full.names = TRUE))
  assign(paste0("as", i, "_list"), lapply(get(paste0("flnames", i)),
                                          function(x){base::as.data.frame(read.csv(x))}))
  nm <- gsub(".csv", "", basename(eval(parse(text = paste0("flnames", i))))) %>% str_sub(., 1,6)
  assign(paste0("as", i, "_list"), setNames(get(paste0("as", i, "_list")), nm))
  df <- Reduce(full_join, get(paste0("as", i, "_list")))
  assign(paste0("as",i), df[!duplicated(base::as.list(df))])

  # Flags the baseline RIDs that also occur in this follow-up. It is computed
  # only after as<i> has been created, so the loop also runs in a fresh session.
  # NOTE(review): the mask has one entry per row of as1 but is used below to
  # subset rows of as<i>; confirm this is the intended selection.
  RID_common <- as1$RID %in% get(paste0("as", i))$RID

  # Left-join this follow-up onto the accumulated table by RID, then drop
  # columns whose contents duplicate an earlier column.
  dfs <- merge(dfs, df, by = "RID", all.x = TRUE)
  dfs <- dfs[!duplicated(base::as.list(dfs))]

  if(paste0("AS", i, "_AREA") %in% colnames(get(paste0("as", i)))){
    assign(paste0("fu_",i-1), get(paste0("as", i))[RID_common, c("RID", paste0("AS", i, "_AREA"))])
    assign(paste0("fu_loss_",i-1), get(paste0("as", i))[!RID_common, c("RID", paste0("AS", i, "_AREA"))])
    # FU rate: rows in this follow-up divided by rows in the baseline
    assign(paste0("fu_rate_", i-1), nrow(get(paste0("as", i)))/nrow(as1))
  }
  else if(paste0("AS", i, "_DATA_CLASS") %in% colnames(get(paste0("as", i)))){
    assign(paste0("fu_",i-1), get(paste0("as", i))[RID_common, c("RID", paste0("AS", i, "_DATA_CLASS"))])
    assign(paste0("fu_loss_",i-1), get(paste0("as", i))[!RID_common, c("RID", paste0("AS", i, "_DATA_CLASS"))])
    # FU rate: rows in this follow-up divided by rows in the baseline
    assign(paste0("fu_rate_", i-1), nrow(get(paste0("as", i)))/nrow(as1))
  }
}

答案1

得分: 1

使用 Sys.glob 和 abbreviate。这会生成一个扁平列表，而不是列表的列表。不使用任何包。

# Parent folder that holds the as1 ... as9 sub-folders.
data_dir <- "C:/Users/thepr/Documents/data"
# Wildcard pattern matching every CSV file inside those sub-folders.
pat <- file.path(data_dir, "as[1-9]", "*.csv")
files <- Sys.glob(pat)
# Read every file into one flat list; mapply() names the elements by path.
L <- mapply(read.csv, files, SIMPLIFY = FALSE)
# Replace the path names with abbreviated base file names.
names(L) <- abbreviate(basename(names(L)), minlength = 6)

或者，也可以使用这些名称替代最后一行：

# Alternative naming: join the folder name and the file name with "." before
# abbreviating. The native pipe keeps this snippet free of package
# dependencies (requires R >= 4.1).
names(L) <- paste(basename(dirname(files)), basename(files), sep = ".") |>
  abbreviate(6)

英文:

Use Sys.glob and abbreviate. This gives a flat list rather than a list of lists. No packages are used.

# Parent folder that holds the as1 ... as9 sub-folders.
data_dir <- "C:/Users/thepr/Documents/data"
# Wildcard pattern matching every CSV file inside those sub-folders.
pat <- file.path(data_dir, "as[1-9]", "*.csv")
files <- Sys.glob(pat)
# Read every file into one flat list; Map() names the elements by path.
L <- Map(read.csv, files)
# Replace the path names with abbreviated base file names.
names(L) <- abbreviate(basename(names(L)), 6)

or maybe use these names instead of the last line:

# Alternative naming: join the folder name and the file name with "." before
# abbreviating (native pipe, requires R >= 4.1).
names(L) <- paste(basename(dirname(files)), basename(files), sep = ".") |>
  abbreviate(6)

答案2

得分: 0

Thanks to @Gregor Thomas, library could be tidied as follows:

library(tidyverse) #包括: dplyr, stringr, tidyr, purrr
library(magrittr)
library(lubridate)
library(ExclusionTable)
library(zoo) #as.Date 函数
library(janitor) #用于回归分析
library(survival) #用于回归分析
library(ggsurvfit) #用于回归分析
library(gtsummary) #用于回归分析 
library(tidycmprsk) #用于回归分析

Thanks to @joran 和 @moodymudskipper，不再使用for循环、eval(parse(text = ...)) 或 get(paste0(...))，而是使用向量和lapply。

这里的关键是使用了嵌套列表。我有一个文件夹列表，每个文件夹包含8个或更多csv文件。

# Common path prefix of the follow-up folders; the folder number is appended.
data_dir <- "C:/Users/thepr/Documents/data/as"
num_fu <- 1:9
dirs <- paste0(data_dir, num_fu)
# For each folder: read all of its CSV files and full-join them into one
# data frame, giving a list with one joined data frame per folder.
as_list <- lapply(dirs, function(folder) {
  csv_paths <- list.files(folder, pattern = "\\.csv$", full.names = TRUE)
  # Label each path with the first six characters of its file name.
  csv_paths <- setNames(csv_paths, str_sub(basename(csv_paths), 1, 6))
  tables <- lapply(csv_paths, read.csv)
  Reduce(full_join, tables)
})

关于后续和损失率，我将很快发布简洁的答案！

英文:

Thanks to @Gregor Thomas, library could be tidied as follows:

library(tidyverse) #Includes: dplyr, stringr, tidyr, purrr
library(magrittr)
library(lubridate)
library(ExclusionTable)
library(zoo) #as.Date function
library(janitor) #For Regression analysis
library(survival) #For Regression analysis
library(ggsurvfit) #For Regression analysis
library(gtsummary) #For Regression analysis 
library(tidycmprsk) #For Regression analysis

Thanks to @joran and @moodymudskipper, instead of using for loops and eval(parse(text = ...)) or get(paste0(...)), I have used vectors and lapply.

The thing here is the use of nested lists. I have a list of folders where each folder contains 8+ csv files.

# Common path prefix of the follow-up folders; the folder number is appended.
data_dir <- "C:/Users/thepr/Documents/data/as"
num_fu <- 1:9
dirs <- paste0(data_dir, num_fu)
# For each folder: read all of its CSV files and full-join them into one
# data frame, giving a list with one joined data frame per folder.
as_list <- lapply(dirs, function(x) {
  files <- list.files(x, pattern = "\\.csv$", full.names = TRUE)
  # Label each path with the first six characters of its file name.
  names(files) <- str_sub(basename(files), 1, 6)
  Reduce(full_join, lapply(files, read.csv))
})

As per follow-up and loss rate, I will post concise answers soon!

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

How to be concise in R code: 1) Reading lists of *.csv files -> 2) rename lists -> 3) merge -> 4) get follow up rates

问题

答案1

答案2

使用purrr在多个列上进行多个映射的重新编码。

基于三个条件筛选 ID，大型数据框。

Error in `[[<-`(`tmp`, i, value = sub("\\_.*", "", i)) : attempt to select more than one element in vectorIndex

在R中沿多个轴绘制列。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。