Loading multiple spreadsheets into R via a script

Monday. June 13, 2016 - 8 mins

I recently helped a colleague with writing a script to load a number of spreadsheets into R. He was using Excel spreadsheets to enter data on individual cases with one spreadsheet containing all the longitudinal data for an individual, i.e. multiple observations per individual in each spreadsheet. He then wanted to combine the data for all the individuals for analysis. It was tedious to do so by hand to combine all the data points into a merged spreadsheet, so we set out to write script to do it. This is what the spreadsheet looked like initially (data has been anonymised):

#Step 1. Load required libraries
library(readxl)

#Step 2. Find out the names of the spreadsheets you want to combine in the target folder
filenames <- sort(list.files(path = "data/", pattern="*.xlsx"))

#Step 3. Load all the variables into individual dataframes you name according to the number 
dfnames <- c(paste0("variables",1:length(filenames)))

for (i in 1:length(filenames)) {
        temp <- read_excel(paste0("data/",filenames[i]), sheet=1, skip=17, col_names=FALSE)[1:58,1:8]
        assign(dfnames[i], temp)
}

ls()

## [1] "big_dataframe" "dfnames"       "filenames"     "i"            
## [5] "rmd2md"        "temp"          "variables1"    "variables2"   
## [9] "variables3"

#Step 4. Repeat for demographics, treatment and time resolution
dfnames <- c(paste0("demographics",1:length(filenames)))
for (i in 1:length(filenames)) {
        temp <- read_excel(paste0("data/",filenames[i]), sheet=1, skip=2, col_names=FALSE)[1:12,1:2]
        assign(dfnames[i], temp)
}

dfnames <- c(paste0("treatments",1:length(filenames)))
for (i in 1:length(filenames)) {
        temp <- read_excel(paste0("data/",filenames[i]), sheet=1, skip = 77, col_names = FALSE)[1:36,]
        assign(dfnames[i], temp)
}

dfnames <- c(paste0("time_resolution",1:length(filenames)))
for (i in 1:length(filenames)) {
        temp <- read_excel(paste0("data/",filenames[i]), sheet = 1, skip = 115,col_names = FALSE)[1:5,1:4]
        assign(dfnames[i], temp)
}

ls()

##  [1] "big_dataframe"    "demographics1"    "demographics2"   
##  [4] "demographics3"    "dfnames"          "filenames"       
##  [7] "i"                "rmd2md"           "temp"            
## [10] "time_resolution1" "time_resolution2" "time_resolution3"
## [13] "treatments1"      "treatments2"      "treatments3"     
## [16] "variables1"       "variables2"       "variables3"

#Step 5. Set up an empty dataframe to accept the values you want to choose
big_dataframe <- data.frame(id ="", 
                            pt_initials ="", 
                            weight ="", 
                            age ="",
                            sex ="",
                            diabetes_type ="",
                            new_dx ="",
                            insulin ="",
                            dx_time ="",
                            itu_time ="",
                            dx_fio2 ="",
                            itu_fio2 ="",
                            dx_pao2 ="",
                            itu_pao2 ="",
                            itu_hartmanns ="",
                            stringsAsFactors = FALSE
                            )

big_dataframe

##   id pt_initials weight age sex diabetes_type new_dx insulin dx_time
## 1                                                                   
##   itu_time dx_fio2 itu_fio2 dx_pao2 itu_pao2 itu_hartmanns
## 1

#Step 6. Populate the cells of the dataframe with the data you want 
for (i in 1:length(filenames)) {
        big_dataframe[i,"id"] <- get(paste0("demographics", i))[1,2]
        big_dataframe[i,"pt_initials"] <- get(paste0("demographics", i))[2,2]
        big_dataframe[i,"weight"] <- get(paste0("demographics", i))[7,2]
        big_dataframe[i,"age"] <- get(paste0("demographics", i))[8,2]
        big_dataframe[i,"sex"] <- get(paste0("demographics", i))[9,2]
        big_dataframe[i,"diabetes_type"] <- get(paste0("demographics", i))[10,2]
        big_dataframe[i,"new_dx"] <- get(paste0("demographics", i))[11,2]
        big_dataframe[i,"insulin"] <- get(paste0("demographics", i))[12,2]
        big_dataframe[i, "dx_time"] <- get(paste0("variables", i))[2,2]
        big_dataframe[i, "itu_time"] <- get(paste0("variables", i))[2,3]
        big_dataframe[i, "dx_fio2"] <- get(paste0("variables", i))[6,2]
        big_dataframe[i, "itu_fio2"] <- get(paste0("variables", i))[6,3]
        big_dataframe[i, "dx_pao2"] <- get(paste0("variables", i))[7,2]
        big_dataframe[i, "itu_pao2"] <- get(paste0("variables", i))[7,3]
        big_dataframe[i, "itu_hartmanns"] <- get(paste0("treatments", i))[10,3]
}

summary(big_dataframe)

##       id            pt_initials           weight         
##  Length:3           Length:3           Length:3          
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##      age                sex            diabetes_type     
##  Length:3           Length:3           Length:3          
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##     new_dx            insulin            dx_time         
##  Length:3           Length:3           Length:3          
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##    itu_time           dx_fio2            itu_fio2        
##  Length:3           Length:3           Length:3          
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##    dx_pao2            itu_pao2         itu_hartmanns     
##  Length:3           Length:3           Length:3          
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character

#Step 7. Clean up your now messy environment, leaving just the dataframe you're interested in
rm(list=setdiff(ls(), "big_dataframe"))

sessionInfo()

## R version 3.3.0 (2016-05-03)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows >= 8 x64 (build 9200)
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 
## [2] LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] readxl_0.1.0 knitr_1.11  
## 
## loaded via a namespace (and not attached):
## [1] magrittr_1.5  formatR_1.2.1 tools_3.3.0   Rcpp_0.12.2   stringi_1.0-1
## [6] stringr_1.0.0 evaluate_0.8

Danny Wong

Anaesthetist & Health Services Researcher

Loading multiple spreadsheets into R via a script

Related Posts

Danny Wong