### =================================================================================
### DATA WRANGLING
### =================================================================================
# Toni Monleon Getino - Section of Statistics. University of Barcelona
# Los ejercicios y ejemplos proporcionados en este apartado se basan en materiales propios y de acceso libre, y su único propósito es el uso docente.
# Se ha hecho un esfuerzo para garantizar la precisión y confiabilidad de la información; sin embargo, es importante tener en cuenta que la información puede cambiar con el tiempo.
# Se recomienda a los usuarios que consulten fuentes adicionales y verifiquen la información por sí mismos antes de tomar cualquier decisión o acción basada en la información proporcionada aquí.

# =========Description:
# A recent article from the New York Times said "Data scientists, according to interviews and expert estimates, spend from 50 percent to 80 percent of their time mired in the mundane labor of collecting and preparing data, before it can be explored for useful information."

# =========Principia:
# Data you find "in the wild" will rarely be in a format necessary for analysis, and you will need to manipulate it before exploring the questions you are interested in. This may take more time than doing the analysis itself!
# According to Wickham, data is "tidy" when it meets three key criteria:
# =========Each observation is in a row.
# =========Each variable is in a column.
# =========Each value has its own cell.
# Being observant of these criteria allows us to recognize when data is organized or unorganized. It also provides us a standardized schema and set of tools for cleaning up some of the most common ways that datasets are "messy":
# =========Column headers are values, not variable names.
# =========Multiple variables are stored in one column.
# =========Variables are stored in both rows and columns.
# =========Multiple types of observational units are stored in the same table. # =========A single observational unit is stored in multiple tables. # Before an R program can look for answers, your data must be cleaned up and converted to a form that makes information accessible. In this webinar, you will learn how to use the `dplyr` and `tidyr` packages to optimise the data wrangling process. You'll learn to: # Spot the variables and observations within your data # Quickly derive new variables and observations to explore # Reshape your data into the layout that works best for R # Join multiple data sets together # Use group-wise summaries to explore hidden levels of information within your data # TUTORIAL (MAKE THIS TUTORIAL AS A HOME WORK!!!!): http://jules32.github.io/2016-07-12-Oxford/dplyr_tidyr/ # SEE A TUTORIAL IN: https://r4ds.had.co.nz/ (WRANGLING) #more in: https://tereom.github.io/est-computacional-2019/r-primeros-pasos.html # What libraries do we mainly need to do these tasks? # Install tidyverse libraries and load it (librerias y paquetes para hacer data-wrangling) #pkgs <- c('reshape2', 'plyr', 'ggplot2', 'dplyr', 'data.table', 'Lahman') #install.packages(pkgs) # Do not worry if this takes a while...BUT...BETTER INSTALL tidyverse!!!!!!!!!!! #pero tidyverse hace todo otro paquete Importante los "Data wranglers": https://www.tidyverse.org/packages/ #The tidyverse is an opinionated collection of R packages designed for data science. #All packages share an underlying design philosophy, grammar, and data structures. 
# Install the complete tidyverse with: install.packages("tidyverse")
# https://www.tidyverse.org/packages/
if (!require("tidyverse")) install.packages("tidyverse")
library(tidyverse)

# More exercises in:
# https://b-rodrigues.github.io/modern_R/descriptive-statistics-and-data-manipulation.html#the-tidyverses-enfant-prodige-dplyr
# and https://rpubs.com/StefanoSanfilippo/654777

library(dplyr)  # transform
library(tidyr)  # tidy

# SOME EXERCISES
# The data sets used below are already preloaded in R / RStudio:
# iris, mtcars and Indometh.
?iris
?mtcars
?Indometh

#######################################################
# Variable selection
#######################################################

# using tidyverse:
View(mtcars)
mtcars %>% select(mpg, cyl, gear)  # select VARIABLES by name
mtcars %>% select(1, 2, 10)        # select VARIABLES by position

# using R base:
# mtcars[, c("mpg", "cyl", "gear")]                 # by name
# mtcars[, c(1, 2, 10)]                             # by index
# subset(mtcars, select = c(1, 2, 10))              # by position
# subset(mtcars, select = c("mpg", "cyl", "gear"))  # by name

########################################################
# Create new variables
########################################################

# using tidyverse, with mutate():
mtcars %>%
  # create a variable that calculates the ratio of mpg to hp
  mutate(ratio.mpg.hp = mpg / hp)

# using R base:
# mtcars$ratio.mpg.hp <- mtcars$mpg/mtcars$hp
# transform(iris, petal.ratio = Petal.Length/Petal.Width)

########################################################
# Selection of observations (rows)
########################################################
# With dplyr you use filter().
mtcars %>% filter(cyl == 8)  # keep only the 8-cylinder cars

# using R base:
# mtcars[mtcars$cyl==8,]
# subset(mtcars, cyl==4)

########################################################
# Sorting the tables according to a variable
########################################################

# tidyverse: with dplyr the arrange() function is typical
View(iris)
iris %>% arrange(desc(Species), Sepal.Width)

# using R base:
# View(iris)
# iris[order(rev(iris$Species), iris$Sepal.Length),]  # order() gives positions; rev(x) reverses the order
# iris[order(rev(iris[,1]), iris[,5]),]               # by column position
# iris[with(iris, order(rev(Species), Sepal.Length)),]

########################################################
# Summary of observations (rows)
########################################################
# tidyverse: the classic function is summarise(). summarise_at() also works for
# many variables, and so does summarise() combined with across(), a function
# introduced in dplyr in the June 2020 update.

# Classical form with tidyverse using the forward-pipe operator %>%
# (technically from the magrittr package, but heavily used across the tidyverse):
mtcars %>%
  summarise(mpg.mean   = mean(mpg),
            mpg.median = median(mpg),
            hp.mean    = mean(hp),
            hp.median  = median(hp))

# Shortcuts when the variables and functions to summarise are many:
mtcars %>%  # with summarise_at()
  summarise_at(.vars = c("mpg", "hp"),
               .funs = c("mean", "median"))

mtcars %>%  # with summarise() and across()
  summarise(across(.cols = c(mpg, hp),
                   .fns  = list(Mean = mean, Median = median)))

# using R base, manually:
df.new <- data.frame(mpg.mean   = mean(mtcars$mpg),
                     mpg.median = median(mtcars$mpg),
                     hp.mean    = mean(mtcars$hp),
                     hp.median  = median(mtcars$hp))
df.new

# with aggregate:
# The aggregate function in R isn't ideal for this specific task because it's
# designed to summarize data by groups.
# However, we can achieve the desired (ungrouped) output using aggregate with a
# little trick: an intercept-only formula, mpg ~ 1, places every row in a single
# group, so the overall mean is returned.
# (BUG FIX: the original used `mpg ~ group`, but mtcars has no column `group`,
# and the sentence above was left uncommented, breaking the script.)
aggregate(mpg ~ 1, data = mtcars, FUN = mean)  # only for one variable at a time

########################################################
# Combination of tables: merge data
########################################################
# with tidyverse
# A tibble in R is a special type of data frame introduced by the tibble package
# within the tidyverse. It offers several advantages over base data frames:
#   - Simpler and more readable printing.
#   - Less coercive: tibbles don't automatically convert input data types
#     (e.g. a list column stays a list, while base R might convert it to a factor).
#   - Preserves variable names: names are not modified during creation, which
#     helps with non-standard names that wouldn't be valid in base R.
#   - No automatic row names by default, which is cleaner for some use cases.

# Consider the example data frames (tibbles) shipped with dplyr:
band_members
band_instruments
band_instruments2

# Combining with tidyverse; see the figure at
# https://medium.com/@imanjokko/data-analysis-in-r-series-vi-joining-data-using-dplyr-fc0a83f0f064

# Keep matching names:
inner_join(band_members, band_instruments, by = "name")
# Keep all lines:
full_join(band_members, band_instruments, by = "name")
# Keep all lines from the first table:
left_join(band_members, band_instruments, by = "name")
# Keep all the lines of the second table:
right_join(band_members, band_instruments, by = "name")
# The columns to be combined can have different names:
inner_join(band_members, band_instruments2, by = c("name" = "artist"))

# using R base:
# Keep matching names:
merge(band_members, band_instruments, by = "name")
# Keep all lines:
merge(band_members, band_instruments, by = "name", all = TRUE)
# Keep all lines from the first table:
merge(band_members, band_instruments, by = "name", all.x = TRUE)
# Keep all the lines of the second table:
merge(band_members, band_instruments, by = "name", all.y = TRUE)
# The columns to be combined can have different names:
merge(band_members, band_instruments2, by.x = "name", by.y = "artist")

########################################################
# Grouping operations --> AGGREGATE
# Summarizing grouped lines
########################################################
# Grouping with dplyr: the most typical function is group_by().
#
# The pipe operator %>% — streamlining data manipulation:
# dplyr's %>% operator lets you seamlessly chain together multiple data
# manipulation operations, making the code more concise and readable.
#
# The essence of chaining operations: imagine preparing a meal, where each step
# (chopping vegetables, cooking meat, combining ingredients) is one data
# manipulation operation. Traditionally each step is performed and stored
# separately; the pipe streamlines this, like an assembly line.
#
# With the pipe you express the whole sequence as a single chain where each
# operation flows into the next, mirroring the actual flow of the data through
# the manipulation process.
# Problem: present data grouped by carb and gear, sorted by gear, and compute a
# statistical summary of the data.
View(mtcars)

mtcars %>%
  group_by(carb, gear) %>%
  summarise(mpg.mean   = mean(mpg),
            mpg.median = median(mpg),
            hp.mean    = mean(hp),
            hp.median  = median(hp)) %>%
  arrange(gear) %>%
  ungroup()  # remove the grouping from the analysis that follows

# R base equivalent with by():
mtcars_by <- by(mtcars, INDICES = list(mtcars$carb, mtcars$gear),
                FUN = function(part) {
                  data.frame(carb       = unique(part$carb),
                             gear       = unique(part$gear),
                             mpg.mean   = mean(part$mpg),
                             mpg.median = median(part$mpg),
                             hp.mean    = mean(part$hp),
                             hp.median  = median(part$hp))
                })
# Merge the per-group results into one data frame
# (rbind: bind by rows, cbind: bind by columns)
do.call(rbind, mtcars_by)

# other possibility in R base (ERROR NOW IN R):
# aggregate(formula = cbind(mpg, hp) ~ carb + gear,
#           data = mtcars,
#           FUN = function(x){
#             c(mean = mean(x), median = median(x))})

########################################################
# Create new columns with grouped calculations
########################################################
# For example, creating a "centered" column that quantifies the difference of a
# measurement from the average of the group the observation belongs to.

# Creation with tidyverse:
iris %>%
  group_by(Species) %>%
  mutate(Sepal.Length.centered = Sepal.Length - mean(Sepal.Length)) %>%
  ungroup()

# R base, first alternative (ave() applies the function within each group):
iris$Sepal.Length.centered <- ave(iris$Sepal.Length, iris$Species,
                                  FUN = function(x) x - mean(x))

# R base, second alternative: first work with by() ...
mtcars_by2 <- with(mtcars, by(mtcars, cyl, FUN = function(part) {
  part$mpg.diff <- part$mpg - mean(part$mpg)
  part
}))
# ... then combine the result into a data frame
do.call(rbind, mtcars_by2)  # rbind: bind by rows, cbind: bind by columns

#################################################################
# Filtering of lines according to conditions evaluated in the groups
#################################################################
# For example, keeping the maximums of each group.
# In the case of mtcars, this would, for example, filter the maximum hp
# according to cyl, the number of cylinders.

# tidyverse: using group_by() and filter()
mtcars %>%
  group_by(cyl) %>%
  filter(hp == max(hp))

# R base: the by() function is used again
max_hp <- with(mtcars, by(mtcars, cyl, FUN = function(part) {
  part[part$hp == max(part$hp), ]
}))
do.call(rbind, max_hp)

#################################################################
# Create elegant visualizations using ggplot2
#################################################################
# see: https://ggplot2.tidyverse.org/
# The easiest way to get ggplot2 is to install the whole tidyverse:
# install.packages("tidyverse")
# Alternatively, install just ggplot2 (not necessary — it is included in tidyverse):
if (!require("ggplot2")) install.packages("ggplot2")
library(ggplot2)
# Or the development version from GitHub:
# install.packages("devtools")
# devtools::install_github("tidyverse/ggplot2")

# Cheatsheet: https://github.com/rstudio/cheatsheets/blob/master/data-visualization-2.1.pdf
# It's hard to succinctly describe how ggplot2 works because it embodies a deep
# philosophy of visualisation. In most cases you start with ggplot(), supply a
# dataset and aesthetic mapping (with aes()), then add layers (like geom_point()
# or geom_histogram()), scales (like scale_colour_brewer()), faceting
# specifications (like facet_wrap()) and coordinate systems (like coord_flip()).

# bar chart of counts by class
ggplot(mpg, aes(class)) + geom_bar()
# bar chart by class, filled by drv
ggplot(mpg, aes(class, fill = drv)) + geom_bar()
# more examples in: https://ggplot2-book.org/
ggplot(mpg, aes(class, fill = hwy, group = hwy)) + geom_bar()

# mean plots in R: ---------------------
############
# example 1
############
###########
# example 2
###########
View(ToothGrowth)
tg <- ToothGrowth
head(tg)
## Summarizes data.
## Gives count, mean, standard deviation, standard error of the mean, and
## confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summarized
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
##   .drop: forwarded to plyr::ddply; whether to drop unused group combinations
## Returns a data frame with one row per group and columns: the grouping
## variables, N, <measurevar> (the group mean), sd, se and ci.
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) {
  library(plyr)  # provides ddply() and rename() used below

  # New version of length which can handle NA's: if na.rm==T, don't count them
  length2 <- function (x, na.rm=FALSE) {
    if (na.rm) sum(!is.na(x)) else length(x)
  }

  # This does the summary. For each group's data frame, return a vector with
  # N, mean, and sd
  datac <- ddply(data, groupvars, .drop=.drop,
                 .fun = function(xx, col) {
                   c(N    = length2(xx[[col]], na.rm=na.rm),
                     mean = mean (xx[[col]], na.rm=na.rm),
                     sd   = sd (xx[[col]], na.rm=na.rm)
                   )
                 },
                 measurevar
  )

  # Rename the "mean" column so it carries the measured variable's name
  datac <- rename(datac, c("mean" = measurevar))

  datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

  # Confidence interval multiplier for standard error
  # Calculate t-statistic for confidence interval:
  # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
  ciMult <- qt(conf.interval/2 + .5, datac$N-1)
  datac$ci <- datac$se * ciMult

  return(datac)
}

library(ggplot2)

# summarySE provides the standard deviation, standard error of the mean, and a
# (default 95%) confidence interval
tgc <- summarySE(tg, measurevar="len", groupvars=c("supp","dose"))
tgc
# After the data is summarized, we can make the graph. These are basic line and
# point graphs with error bars representing either the standard error of the
# mean, or the 95% confidence interval.
# Standard error of the mean
ggplot(tgc, aes(x = dose, y = len, colour = supp)) +
  geom_errorbar(aes(ymin = len - se, ymax = len + se), width = .1) +
  geom_line() +
  geom_point()

# The error bars overlapped, so use position_dodge to move them horizontally
pd <- position_dodge(0.1)  # move them .05 to the left and right

ggplot(tgc, aes(x = dose, y = len, colour = supp)) +
  geom_errorbar(aes(ymin = len - se, ymax = len + se), width = .1, position = pd) +
  geom_line(position = pd) +
  geom_point(position = pd)

# Use the 95% confidence interval instead of the SEM
ggplot(tgc, aes(x = dose, y = len, colour = supp)) +
  geom_errorbar(aes(ymin = len - ci, ymax = len + ci), width = .1, position = pd) +
  geom_line(position = pd) +
  geom_point(position = pd)

# Black error bars - notice the mapping of 'group=supp' -- without it, the
# error bars won't be dodged!
ggplot(tgc, aes(x = dose, y = len, colour = supp, group = supp)) +
  geom_errorbar(aes(ymin = len - ci, ymax = len + ci), colour = "black", width = .1, position = pd) +
  geom_line(position = pd) +
  geom_point(position = pd, size = 3)

################################
# proposed exercises
################################

# Exercise 1 ---------------------------------
?Indometh
# The Indometh data frame has 66 rows and 3 columns of data on the
# pharmacokinetics of indometacin (or, older spelling, 'indomethacin').
dataIndometh <- Indometh

# compute mean and sd of Indometh conc, grouped by time
dataIndometh <- Indometh %>%
  group_by(time) %>%
  summarise(conc.mean = mean(conc),
            conc.sdv  = sd(conc)) %>%
  arrange(time) %>%
  ungroup()  # remove the grouping from the analysis that follows

# Exercise 2 -------------------------------
# The object data(prostate) — see ??prostate, faraway::prostate from
# library(faraway) — contains the data frame prostate: 97 rows and 9 columns,
# a study on 97 men with prostate cancer who were due to receive a radical
# prostatectomy.
# Source: Andrews DF and Herzberg AM (1985): Data. New York: Springer-Verlag.
# The variables of prostate are the following (the original descriptions here
# were garbled; corrected against faraway's documentation — verify there):
#   [,1] lcavol  log(cancer volume)
#   [,2] lweight log(prostate weight)
#   [,3] age     age
#   [,4] lbph    log(benign prostatic hyperplasia amount)
#   [,5] svi     seminal vesicle invasion
#   [,6] lcp     log(capsular penetration)
#   [,7] gleason Gleason score
#   [,8] pgg45   percentage Gleason scores 4 or 5
#   [,9] lpsa    log(prostate specific antigen)
# Note: the data come from the data frame prostate of the faraway package,
# loaded with the instruction data(prostate).
library(faraway)
data(prostate)
prostate <- prostate  # keep an explicit copy in the workspace

# represent the number of patients by age
library(ggplot2)
# ... in subgroups of svi
ggplot(prostate, aes(age, fill = factor(svi))) + geom_bar()

# create a new age grouping, cutting at the 30% and 90% quantiles of age
library(Hmisc)
prostate$age.group <- cut2(prostate$age, quantile(prostate$age, c(0.3, 0.9)))

# rows with the minimum lcp within each age group
res1 <- prostate %>%
  group_by(age.group) %>%
  filter(lcp == min(lcp))
res1

# sorted by age.group (descending) and lcavol
res2 <- arrange(res1, desc(age.group), lcavol)
res2

# Exercise 3 --------------------------------------
# Using data frame prostate:
# write a function that replaces lcavol > cut.off (ex: cut.off = 0.5) by the
# mean(lcavol) of each age category (age.group);
# it returns the number of changes and the changed data frame.
change.lcavol <- function(cut.off = 0.5, data = prostate) {
  # cut.off: threshold above which lcavol values are replaced
  # data:    data frame with columns lcavol and age.group
  #          (defaults to the global `prostate` prepared above — backward
  #           compatible with the original zero/one-argument calls)
  # Returns: list(changes = <number of replacements>, data = <changed data frame>)
  changed <- data
  changed$lcavol.categorized <- NA  # column that receives the replacement values
  changes <- 0
  for (i in seq_len(nrow(changed))) {
    if (changed$lcavol[i] > cut.off) {
      # mean of lcavol within this observation's age category, as the exercise
      # statement requires (BUG FIX: the original used the overall mean)
      in.group <- changed$age.group == changed$age.group[i]
      changed$lcavol.categorized[i] <- mean(changed$lcavol[in.group])
      changes <- changes + 1
      cat("Change in line: ", i, sep = "", fill = TRUE)
    }
  }
  # BUG FIX: the original executed `prostate.changed <- prostate` here, which
  # overwrote the modified data frame with the untouched original right before
  # returning it, so every change was silently discarded.
  return(list(changes = changes, data = changed))
}

# use the function
library(faraway)
data(prostate)
res <- change.lcavol(cut.off = 0.7)
res[[1]]  # only see the number of changes

# Exercise 4 ------------------------------------------
# Using data frame iris:
# Create a function with the name Function1, with a parameter called Option1.
# If Option1 = 1 (TRUE), Function1:
#   - creates subgroups in a data frame for a requested variable, given the
#     data frame, the position of the variable to be grouped and the limits of
#     the categories to be constructed (ex: 2, 3, 4, ...);
#   - prints a summary of the new subgroups;
#   - returns descriptive statistics of a chosen variable by the new subgroups
#     AND Species.
iris

Function1 <- function(Option1 = 1, datafr = iris, variable.pos = 3,
                      limits.g = c(1.1, 2, 5), var.to.describe = 1) {
  # Option1:         1 = build the grouping and return the group/Species summary;
  #                  any other value returns datafr unchanged
  #                  (BUG FIX: the original errored with "object 'datafr1' not
  #                  found" in that case)
  # datafr:          data frame to work on; must contain a Species column
  # variable.pos:    column position of the variable to be grouped
  # limits.g:        cut points for the categories (passed to Hmisc::cut2)
  # var.to.describe: column position of the variable to be described
  library(Hmisc)
  datafr$new.group <- NA  # initialise the grouping variable
  if (Option1 == 1) {
    # BUG FIX: the original always cut column 3 (datafr[,3]), ignoring the
    # variable.pos argument.
    datafr$new.group <- cut2(datafr[[variable.pos]], limits.g)
    print("New subgroups")
    print(summary(datafr$new.group))  # print summary of the new group

    # Describe the data frame by the new subgroups AND Species.
    # BUG FIX: the original always reported column 1, ignoring var.to.describe.
    var.name <- names(datafr)[var.to.describe]
    cat("The analyzed variable by subgroups and species is: ", var.name,
        fill = TRUE, sep = "")
    # BUG FIX: the original computed mean(datafr[,1]) / sd(datafr[,1]) inside
    # summarise(), i.e. the *overall* mean/sd repeated identically for every
    # group; the .data pronoun yields the intended per-group statistics.
    datafr1 <- datafr %>%
      group_by(new.group, Species) %>%
      summarise(var.mean = mean(.data[[var.name]]),
                var.sdv  = sd(.data[[var.name]])) %>%
      arrange(new.group) %>%
      ungroup()  # remove the grouping from the analysis that follows
    return(datafr1)
  }
  # Option1 != 1: nothing to compute; return the data frame (with the empty
  # new.group column) instead of failing.
  datafr
}

# run the function
Function1()