### ======================================================== ### FUNCIONES EN R ### ======================================================== # Toni Monleon-Getino (Curso de Data Science, 2024, Fac Biologia UB) # BASED ON https://swcarpentry.github.io/r-novice-inflammation/02-func-R/ # CREATE FUNCTIONS ### ======================================================== ### OVERVIEW ### ======================================================== # Questions # How do I make a function? # How can I test my functions? # How should I document my code? # Objectives # Define a function that takes arguments. # Return a value from a function. # Test a function. # Set default values for function arguments. # Explain why we should divide programs into small, single-purpose functions. # If we only had one data set to analyze, it would probably be faster to load the file into a spreadsheet and use that to plot some simple statistics. But we have twelve files to check, and may have more in the future. In this lesson, we'll learn how to write a function so that we can repeat several operations with a single command. ### ======================================================== ### A FIRST AMAZING EXAMPLE ### ======================================================== #funcion para tirar dos dados: #Below some code to look at the observed and expected probability to find the different values of the sum of two six faced fair dice. #http://rpubs.com/Lionel/11497 # see at: https://shiny.rstudio.com/gallery/ # An?lisis de la infestacion de varroa (acaro de las abejas): # http://biost3.bio.ub.edu:3838/appvarroa3/ # Analisis de bioequivalencia farmacologico: # http://biost3.bio.ub.edu:3838/EqDIAGNOBATCH/ ### ======================================================== ### Write functions ### ======================================================== # usar funciones ya implementadas en R como matrix, tapply, etc ## Writing just the name of a function returns its code ## ---------------------------------------------------- matrix tapply summary ## generate data for medical example medical.example <- data.frame(patient = 1:100, age = rnorm(100, mean = 60, sd = 12), treatment = gl(2, 50, labels = c("Treatment", "Control"))) summary(medical.example) #quiero resumir la edad por tratamiento ## tapply(Summary Variable, Group Variable, Function) ## Medical Example tapply(medical.example$age, medical.example$treatment, mean) ## Function args returns the arguments of a function, function body its code ## ------------------------------------------------------------------------- args(tapply) body(tapply) ## How to create a new function? EXERCISE1 ## ============================= # Defining functions in R is very similar to how we do it in Python. # We define the name of the function and the arguments using **function()**. # do a function that sums two numbers. # Soluction doSum <- function(number1,number2){ # do the operation and save it to a variable result <- number1+number2 # output to return. If not explicit it'll return the last command run return(result) } doSum(2,3) # we can save the output of the function into a variable result1_6 <- doSum(1,6) result1_6 #but... doSum(2,"ww" ) #number1<-1 #number2<-1 doSum_a <- function(number1,number2){ if (is.numeric(number1)==T & is.numeric(number2)==T){ # do the operation and save it to a variable result <- number1+number2 # output to return. If not explicit it'll return the last command run return(result) } if (is.numeric(number1)==F | is.numeric(number2)==F){ print("Arguments wrong !!!") } } doSum_a(1,"2") doSum_a(1,NA) doSum_a(1,1) ######### #EXAMPLE ########## # An example with the trimmed mean trimmedmean <- function(x) #x representa a los argumentos (parametros) { print(paste("la media truncada al 10% es:",mean(x, trim=0.1))) } # Let's check whether trimmedmean was created and whether it works ls() #indica los objetos en memoria trimmedmean #funcion (x <- 1:10) #genero un vector de 1 a 10 mean(x) #calculo media trimmedmean(x) #calculo trimmedmean var(x) # We can also quickly read the code from an external file rm(trimmedmean) #elimino la funcion file.show("Rfunction_trimmedMedia.R") source("Rfunction_trimmedMedia.R") #llamo la funcion externamente desde un fichero ls() trimmedmean #example load("DataFrameExamples.RData") #see the data frames imported #plot to explore data of lymphocites: library(ggplot2) ggplot(immuno, aes( y = lympho1,fill = "#red")) + geom_boxplot() + coord_flip() #what is a box-plot? https://www.r-bloggers.com/2021/11/how-to-make-stunning-boxplots-in-r-a-complete-guide-with-ggplot2/ hist(immuno$lympho1,breaks = 20,col="blue") #sample mean & median mean(immuno$lympho1); median(immuno$lympho1) #trimmed mean trimmedmean(immuno$lympho1) ## Functions that return more than one value (or not) ## ================================================== #load data load("DataFrameExamples.RData") file.show("Rfunctions_vars1to4.R") source("Rfunctions_vars1to4.R") ls() # The new functions: vars1; vars2; vars3; vars4 vars1(immuno$nkiller1) vars2(immuno$nkiller1) vars3(immuno$nkiller1) vars4(immuno$nkiller1) ## A more useful example: a function that draws some graphs ## ======================================================== file.show("Rfunction_simpGraphs.R") source("Rfunction_simpGraphs.R") # Function simpGraphs simpGraphs x <- rnorm(10000, 10, 3) y <- rchisq(10000, 10) simpGraphs(x) simpGraphs(y) ## Note: # Function qqline has a problem with labels. :-( library(Hmisc) simpGraphs(immuno$nkiller1) # The R doctor says: # Use function unclass to solve the problem simpGraphs(unclass(immuno$nkiller1)) ## We can improve that function adding some more options ## ===================================================== file.show("Rfunction_fourGraphs.R") source("Rfunction_fourGraphs.R") # Function fourGraphs fourGraphs x <- rnorm(1000) y <- rchisq(1000, 5) fourGraphs(x) fourGraphs(x, "khaki", 2) # Error message: fourGraphs(immuno$group) # solution fourGraphs(unclass(immuno$group)) # Let's close all windows (if you want) graphics.off() ####################################### # EJERCICIOS PARA LOS ESTUDIANTES ###################################### ############################ # EJERCICIO 2: remuestreo estadistico ############################ #Let's say that we want to know if a person has high blood pressure. #What do we need to know? The value of blood pressure in that person and what is the average value for a person. #How do we know the later? #Do we measure it for every human on earth and then calculate the mean? That's impossible!. #THis is what happens when we are interested in the population mean ($\mu$) and population variance ($\sigma^2$) of a variable. #Usually it is impossible to know these *parameters* because we cannot measure it for all individuals. #What we do? We sample random subsets of people, measure the variable (in this case blood pressure) and *estimate* the population parameters ($\mu$ and $\sigma^2$) by the observed values in the samples values (sample mean, $x$ and sample variance, $s^2$). ## Sampling distribution #Assume that you repeatedly sample from a population, grabbing *n* items each time. Then, for each sample of *n* items, you calculate and plot the mean. The resulting distribution of sample means is the *sampling distribution of the mean*. And the standard deviation of these sample means is called the *standard error of the mean (SEM)*. #In the next figure, you can see a histogram for the distribution of blood pressure (systolic) in the general population (Fig. 1A). It is a variable that follows a normal distribution with mean = 125 and sd = 15 (in mmHg). Then we sampled a hundred values from that distribution and calculated the mean. We did that a thousand times and plotted the sampling distribution of the mean (Fig. 1B). op <- par(mfrow=c(1,2)) # change plotting settings to plot things in 1 row, 2 columns #simulate a population of 1 milion persons with 125mmHg (mean) and sdv=15 mmHg blood_pressure <- rnorm(n = 1e6,mean = 125, sd = 15) hist(blood_pressure, main = "A",xlab = NULL) abline(v=mean(blood_pressure),lty=2) #think a function to sample the population and stimate the mean and sdv doSampling <- function(dat,numSamples,sizeSamples){ # calculate and plot a sampling distribution of the mean # parameters given are: # dat = data to sample from (numeric vector), # numSamples = number of times you want to sample (number) # sizeSamples = size of the sample (number) # returns a numeric vector with the values of the sample means out <- numeric(numSamples) for (i in seq(1,numSamples)){ # simple random sampling get_sample <- sample(dat, size = sizeSamples,replace = TRUE) # calculate mean and store it in the numeric vector out[i] <- mean(get_sample) } return(out) } #use the function sample_mean <- doSampling(blood_pressure,numSamples = 1000,sizeSamples = 100) View(sample_mean) hist(sample_mean, main = "B",xlab = NULL) abline(v=mean(sample_mean),lty=2) par(op) # return ploting options to default #media: resultados estadistico mean(sample_mean) #media sd(sample_mean) #error medio: sdv/raiz(n=100) 15/sqrt(100) #We see that the sampling distribution of the mean blood pressure (1.B) has the same mean as the original variable, but the width of the distribution (variance) is smaller. #This is the result that we expected. Thanks to the **Central Limit Theorem**, we don't need to measure the whole population to know that the population mean will be approximately 125 $\pm$ 1.4 (mean $\pm$ standard deviation). The standard deviation shows how much this estimate could change if we choose different members of the population for our sample. ### ======================================================== # DEBUGGING: CONTROL DE ERRORES ############################################################# # SEE AT: https://support.rstudio.com/hc/en-us/articles/205612627-Debugging-with-RStudio options(error = browser()) #OPCIONES DE DEBUGGING EN RSTUDIO doSum(1,"2") #ERROR EN LA FUNCION SUMA ### ======================================================== ### LIBRARIES IN R ### ======================================================== #COLOCAR UNA SERIE DE FUNCIONES Y UTILIZARLAS, SIN TENER QUE LLAMARLAS, SOLO INSTALAR UN PAQUETE (LIBRARY) #LIBRAIES... LET' S GO TO DO IT! ### ======================================================== ### ADDITIONAL EXERCISES ### ======================================================== #CREAR UNA FUNCION PARA CONVERTIR GRADOS FARENHEIT EN KELVIN (INPUT: FARENHEIT Y OUTPUT: KELVIN) ### ======================================================== ### Defining a Function ### ======================================================== # Let's start by defining a function fahrenheit_to_kelvin that converts temperatures from Fahrenheit to Kelvin: fahrenheit_to_kelvin <- function(temp_F) { temp_K <- ((temp_F - 32) * (5 / 9)) + 273.15 return(temp_K) } # We define fahrenheit_to_kelvin by assigning it to the output of function. The list of argument names are contained within parentheses. Next, the body of the function-the statements that are executed when it runs-is contained within curly braces ({}). The statements in the body are indented by two spaces, which makes the code easier to read but does not affect how the code operates. # When we call the function, the values we pass to it are assigned to those variables so that we can use them inside the function. Inside the function, we use a return statement to send a result back to whoever asked for it. # Automatic Returns # In R, it is not necessary to include the return statement. R automatically returns whichever variable is on the last line of the body of the function. Since we are just learning, we will explicitly define the return statement. # Let's try running our function. Calling our own function is no different from calling any other function: # freezing point of water fahrenheit_to_kelvin(32) # [1] 273.15 # boiling point of water fahrenheit_to_kelvin(212) # [1] 373.15 # We've successfully called the function that we defined, and we have access to the value that we returned. # Composing Functions # Now that we've seen how to turn Fahrenheit into Kelvin, it's easy to turn Kelvin into Celsius: kelvin_to_celsius <- function(temp_K) { temp_C <- temp_K - 273.15 return(temp_C) } # absolute zero in Celsius kelvin_to_celsius(0) # [1] -273.15 #What about converting Fahrenheit to Celsius? #We could write out the formula, but we don't need to. Instead, we can compose the two functions we have already created: fahrenheit_to_celsius <- function(temp_F) { temp_K <- fahrenheit_to_kelvin(temp_F) temp_C <- kelvin_to_celsius(temp_K) return(temp_C) } # freezing point of water in Celsius fahrenheit_to_celsius(32.0) #[1] 0 #This is our first taste of how larger programs are built: we define basic operations, then combine them in ever-larger chunks to get the effect we want. Real-life functions will usually be larger than the ones shown here-typically half a dozen to a few dozen lines-but they shouldn't ever be much longer than that, or the next person who reads it won't be able to understand what's going on. #Chaining Functions (Funcion de una funcion) #This example showed the output of fahrenheit_to_kelvin assigned to temp_K, which is then passed to kelvin_to_celsius to get the final result. It is also possible to perform this calculation in one line of code, by "chaining" functions together, like so: # freezing point of water in Celsius kelvin_to_celsius(fahrenheit_to_kelvin(32.0)) #Named Variables and the Scope of Variables #Functions can accept arguments explicitly assigned to a variable name in in the function call functionName(variable = value), as well as arguments by order: input_1 <- 20 mySum <- function(input_1, input_2 = 10) { output <- input_1 + input_2 return(output) } #Given the above code was run, which value does mySum(input_1 = 1, 3) #produce? 4 #If mySum(3) returns 13, why does mySum(input_2 = 3) return an error? #=================================== # CENTER / STANDARIZED DATA #=================================== #CREAR UNA FUNCION PARA CENTRAR (ESTANDARIZAR DATOS) #Once we start putting things in functions so that we can re-use them, we need to start testing that those functions are working correctly. To see how to do this, let's write a function to center a dataset around a particular value: center <- function(data, desired) { new_data <- (data - mean(data)) + desired return(new_data) } #We could test this on our actual data, but since we don't know what the values ought to be, it will be hard to tell if the result was correct. Instead, let's create a vector of 0s and then center that around 3. This will make it simple to see if our function is working as expected: z <- c(0, 0, 0, 0) z #[1] 0 0 0 0 center(z, 3) #[1] 3 3 3 3 center(c(1.2, 3.5, 7.8),0) #That looks right, so let's try center on our real data. We'll center the inflammation data from day 4 around 0: #dat <- read.csv(file = "data/inflammation-01.csv", header = FALSE) #centered <- center(dat[, 4], 0) #head(centered) #[1] 1.25 -0.75 1.25 -1.75 1.25 0.25 #It's hard to tell from the default output whether the result is correct, but there are a few simple tests that will reassure us: # original min min(dat[, 4]) #[1] 0 # original mean mean(dat[, 4]) #[1] 1.75 # original max max(dat[, 4]) #[1] 3 # centered min min(centered) #[1] -1.75 # centered mean mean(centered) #[1] 0 # centered max max(centered) #[1] 1.25 #That seems almost right: the original mean was about 1.75, so the lower bound from zero is now about -1.75. The mean of the centered data is 0. We can even go further and check that the standard deviation hasn't changed: # original standard deviation sd(dat[, 4]) #[1] 1.067628 # centered standard deviation sd(centered) #[1] 1.067628 #Those values look the same, but we probably wouldn't notice if they were different in the sixth decimal place. Let's do this instead: # difference in standard deviations before and after sd(dat[, 4]) - sd(centered) #[1] 0 #Sometimes, a very small difference can be detected due to rounding at very low decimal places. R has a useful function for comparing two objects allowing for rounding errors, all.equal: all.equal(sd(dat[, 4]), sd(centered)) #[1] TRUE #It's still possible that our function is wrong, but it seems unlikely enough that we should probably get back to doing our analysis. We have one more task first, though: we should write some documentation for our function to remind ourselves later what it's for and how to use it. #A common way to put documentation in software is to add comments like this: center <- function(data, desired) { # return a new vector containing the original data centered around the # desired value. # Example: center(c(1, 2, 3), 0) => c(-1, 0, 1) new_data <- (data - mean(data)) + desired return(new_data) } #Writing Documentation #Formal documentation for R functions is written in separate .Rd using a markup language similar to LaTeX. You see the result of this documentation when you look at the help file for a given function, e.g. ?read.csv. The roxygen2 package allows R coders to write documentation alongside the function code and then process it into the appropriate .Rd files. You will want to switch to this more formal method of writing documentation when you start writing more complicated R projects. #=========================== # More in Functions #=========================== #Write a function called analyze that takes a filename as an argument and displays the three graphs produced in the previous lesson (average, min and max inflammation over time). analyze("data/inflammation-01.csv") should produce the graphs already shown, while analyze("data/inflammation-02.csv") should produce corresponding graphs for the second data set. Be sure to document your function with comments. #Solution #Rescaling #Write a function rescale that takes a vector as input and returns a corresponding vector of values scaled to lie in the range 0 to 1. (If $L$ and $H$ are the lowest and highest values in the original vector, then the replacement for a value $v$ should be $(v-L) / (H-L)$.) Be sure to document your function with comments. #Test that your rescale function is working properly using min, max, and plot. #Solution #Defining Defaults #We have passed arguments to functions in two ways: directly, as in dim(dat), and by name, as in read.csv(file = "data/inflammation-01.csv", header = FALSE). In fact, we can pass the arguments to read.csv without naming them: # dat <- read.csv("data/inflammation-01.csv", FALSE) #However, the position of the arguments matters if they are not named. #dat <- read.csv(header = FALSE, file = "data/inflammation-01.csv") #dat <- read.csv(FALSE, "data/inflammation-01.csv") #Error in read.table(file = file, header = header, sep = sep, quote = quote, : 'file' must be a character string or connection # To understand what's going on, and make our own functions easier to use, let's re-define our center function like this: center <- function(data, desired = 0) { # return a new vector containing the original data centered around the # desired value (0 by default). # Example: center(c(1, 2, 3), 0) => c(-1, 0, 1) new_data <- (data - mean(data)) + desired return(new_data) } #The key change is that the second argument is now written desired = 0 instead of just desired. If we call the function with two arguments, it works as it did before: test_data <- c(0, 0, 0, 0) center(test_data, 3) #[1] 3 3 3 3 #But we can also now call center() with just one argument, in which case desired is automatically assigned the default value of 0: more_data <- 5 + test_data more_data #[1] 5 5 5 5 center(more_data) #[1] 0 0 0 0 #This is handy: if we usually want a function to work one way, but occasionally need it to do something else, we can allow people to pass an argument when they need to but provide a default to make the normal case easier. #The example below shows how R matches values to arguments display <- function(a = 1, b = 2, c = 3) { result <- c(a, b, c) names(result) <- c("a", "b", "c") # This names each element of the vector return(result) } # no arguments display() #a b c #1 2 3 # one argument display(55) #a b c #55 2 3 # two arguments display(55, 66) #a b c #55 66 3 # three arguments display (55, 66, 77) #a b c #55 66 77 #As this example shows, arguments are matched from left to right, and any that haven't been given a value explicitly get their default value. We can override this behavior by naming the value as we pass it in: # only setting the value of c display(c = 77) # a b c #1 2 77 #Matching Arguments #To be precise, R has three ways that arguments are supplied by you are matched to the formal arguments of the function definition: #by complete name, #by partial name (matching on initial n characters of the argument name), and #by position. #Arguments are matched in the manner outlined above in that order: by complete name, then by partial matching of names, and finally by position. #With that in hand, let's look at the help for read.csv(): ?read.csv #There's a lot of information there, but the most important part is the first couple of lines: #read.csv(file, header = TRUE, sep = ",", quote = "\"", # dec = ".", fill = TRUE, comment.char = "", ...) #This tells us that read.csv() has one argument, file, that doesn't have a default value, and six others that do. Now we understand why the following gives an error: #dat <- read.csv(FALSE, "data/inflammation-01.csv")