Esto es una versión de la presentación utilizada en la sesión Análisis estadístico y visualización de datos del workshop de data science, incluyendo el código que por simplicidad y espacio no se ha mostrado en la presentación.
Objetivo:
Advertencia: Las conclusiones sólo son válidas para los datos analizados. Para extraer conclusiones sobre el conjunto de la población es necesaria la inferencia estadística.
El RMS Titanic transportaba unos 1.300 pasajeros y cerca de 900 miembros de la tripulación cuando se hundió el 15 de abril de 1912 (había zarpado el 10 de abril), dejando unos 700 supervivientes.
En esta sesión utilizaremos como ejemplo los datos de una parte de los pasajeros.
library(readr)
library(dplyr)
library(tidyr)
library(forcats)
library(ggplot2)
library(ggmosaic)
# Data preparation ----
# Load the passenger records and derive the categorical columns used by the
# tables and plots that follow.
titanic <- read_csv("titanic_data.txt")
titanic <- mutate(
  titanic,
  # Ticket class with ordinal labels.
  passenger.class = fct_recode(
    as.factor(pclass),
    "1st" = "1", "2nd" = "2", "3rd" = "3"
  ),
  # Survival code ("0"/"1") relabelled as died/lived.
  survival = fct_recode(
    as.factor(survived),
    "died" = "0", "lived" = "1"
  ),
  # Ticket class as a factor that keeps the original codes as levels.
  class = as.factor(pclass),
  # Adult/minor split; a missing age yields a missing group.
  age_group = ifelse(age >= 18, ">=18", "<18")
)
# Absolute frequencies of survival, with the total appended.
addmargins(table(titanic[["survival"]]))
##
## died lived Sum
## 809 500 1309
# Relative frequencies of survival. proportions() replaces the hand-written
# table / sum(table) and matches the calls used for the two-way tables below.
proportions(table(titanic$survival))
##
## died lived
## 0.618029 0.381971
# Bar chart of survival counts, one colour per outcome.
ggplot(titanic, aes(x = survival, fill = survival)) +
  geom_bar()

# Absolute frequencies by sex, with the total appended.
addmargins(table(titanic[["sex"]]))
##
## female male Sum
## 466 843 1309
# Relative frequencies by sex. proportions() replaces the hand-written
# table / sum(table) and matches the calls used for the two-way tables below.
proportions(table(titanic$sex))
##
## female male
## 0.3559969 0.6440031
# Bar chart of passenger counts by sex.
ggplot(titanic, aes(x = sex, fill = sex)) +
  geom_bar()

# Sex-by-survival contingency table with row and column totals.
addmargins(table(sex = titanic$sex, survival = titanic$survival))
## survival
## sex died lived Sum
## female 127 339 466
## male 682 161 843
## Sum 809 500 1309
# Bar chart of counts by sex, each bar split by survival.
ggplot(titanic, aes(x = sex, fill = survival)) +
  geom_bar()

# Mosaic plot of the same sex-by-survival breakdown.
ggplot(data = titanic) +
  geom_mosaic(
    aes(x = product(sex), fill = survival)
  )

# Survival proportions within each sex (every row sums to 1).
addmargins(
  proportions(with(titanic, table(sex, survival)), margin = 1),
  margin = 2
)
## survival
## sex died lived Sum
## female 0.2725322 0.7274678 1.0000000
## male 0.8090154 0.1909846 1.0000000
# Sex proportions within each survival outcome (every column sums to 1).
addmargins(
  proportions(
    table(sex = titanic$sex, survival = titanic$survival),
    margin = 2
  ),
  margin = 1
)
## survival
## sex died lived
## female 0.1569839 0.6780000
## male 0.8430161 0.3220000
## Sum 1.0000000 1.0000000
# Absolute frequencies by ticket class, with the total appended.
addmargins(table(titanic[["class"]]))
##
## 1 2 3 Sum
## 323 277 709 1309
# Relative frequencies by ticket class. proportions() replaces the
# hand-written table / sum(table) and matches the two-way tables below.
proportions(table(titanic$class))
##
## 1 2 3
## 0.2467532 0.2116119 0.5416348
# Bar chart of passenger counts by ticket class.
ggplot(titanic, aes(x = class, fill = class)) +
  geom_bar()

# Same bars, each split by survival.
ggplot(titanic, aes(x = class, fill = survival)) +
  geom_bar()

# Joint class-by-survival proportions (all cells sum to 1), with margins.
addmargins(
  proportions(table(class = titanic$class, survival = titanic$survival))
)
## survival
## class died lived Sum
## 1 0.09396486 0.15278839 0.24675325
## 2 0.12070283 0.09090909 0.21161192
## 3 0.40336134 0.13827349 0.54163484
## Sum 0.61802903 0.38197097 1.00000000
# Survival proportions within each class (every row sums to 1).
addmargins(
  proportions(with(titanic, table(class, survival)), margin = 1),
  margin = 2
)
## survival
## class died lived Sum
## 1 0.3808050 0.6191950 1.0000000
## 2 0.5703971 0.4296029 1.0000000
## 3 0.7447109 0.2552891 1.0000000
# Class proportions within each survival outcome (every column sums to 1).
addmargins(
  proportions(
    table(class = titanic$class, survival = titanic$survival),
    margin = 2
  ),
  margin = 1
)
## survival
## class died lived
## 1 0.1520396 0.4000000
## 2 0.1953028 0.2380000
## 3 0.6526576 0.3620000
## Sum 1.0000000 1.0000000
# Bar chart of counts by class, each bar split by survival.
ggplot(titanic, aes(x = class, fill = survival)) +
  geom_bar()

# Counts by sex split by survival, one panel per class.
ggplot(titanic, aes(x = sex, fill = survival)) +
  geom_bar() +
  facet_grid(cols = vars(class))
## , , class = 1
##
## survival
## sex died lived
## female 5 139
## male 118 61
##
## , , class = 2
##
## survival
## sex died lived
## female 12 94
## male 146 25
##
## , , class = 3
##
## survival
## sex died lived
## female 110 106
## male 418 75
# Five-number summary and mean of age. summary() reports the number of
# missing values by itself; `useNA` is an argument of table(), not of
# summary(), so it was silently ignored and has been dropped.
summary(titanic$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.1667 21.0000 28.0000 29.8811 39.0000 80.0000 263
# Distribution of age; passengers with a missing age are dropped silently.
ggplot(titanic) +
  geom_histogram(aes(x = age), na.rm = TRUE)

# Age distribution with each bin split by survival.
ggplot(titanic) +
  geom_histogram(aes(x = age, fill = survival), na.rm = TRUE)

# Age distribution by survival, one panel per class (columns) and sex (rows).
# na.rm = TRUE added so this histogram drops missing ages without a warning,
# like the other histograms in this section.
ggplot(titanic) +
  geom_histogram(aes(x = age, fill = survival), na.rm = TRUE) +
  facet_grid(cols = vars(class), rows = vars(sex))

# Counts by age group split by survival, one panel per class. Facets use the
# `class` factor (not the raw `pclass` column) for consistency with every
# other plot. Passengers with a missing age form their own NA bar.
ggplot(titanic) +
  geom_bar(aes(x = age_group, fill = survival)) +
  facet_grid(cols = vars(class))

# Same breakdown, additionally split by sex (rows).
ggplot(titanic) +
  geom_bar(aes(x = age_group, fill = survival)) +
  facet_grid(cols = vars(class), rows = vars(sex))
# Distribution of the fare paid; missing fares are dropped silently.
ggplot(titanic, aes(x = fare)) +
  geom_histogram(na.rm = TRUE)

# Fare distribution with each bin split by ticket class.
ggplot(titanic, aes(x = fare, fill = class)) +
  geom_histogram(na.rm = TRUE)

# Box plot of fare for each ticket class.
ggplot(titanic, aes(x = class, y = fare, fill = class)) +
  geom_boxplot(na.rm = TRUE)

# Numeric summary of fare, including the count of missing values.
summary(titanic[["fare"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 7.896 14.454 33.295 31.275 512.329 1
# Numeric summary of fare computed separately for each ticket class.
aggregate(fare ~ class, data = titanic, FUN = summary)
## class fare.Min. fare.1st Qu. fare.Median fare.Mean fare.3rd Qu. fare.Max.
## 1 1 0.00000 30.69580 60.00000 87.50899 107.66250 512.32920
## 2 2 0.00000 13.00000 15.04580 21.17920 26.00000 73.50000
## 3 3 0.00000 7.75000 8.05000 13.30289 15.24580 69.55000
# Scatter plot of fare against age; incomplete rows are dropped silently.
ggplot(titanic, aes(x = age, y = fare)) +
  geom_point(na.rm = TRUE)

# Same scatter plot with a fitted straight line overlaid.
ggplot(titanic, aes(x = age, y = fare)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm", na.rm = TRUE)

# Intercept and slope of the linear regression of fare on age.
coef(lm(fare ~ age, data = titanic))
## (Intercept) age
## 16.0197563 0.6922966
\(\hat{fare} = 16.0197563 + 0.6922966 \cdot age\)
# Fare against age, coloured by class, with one fitted line per class.
ggplot(titanic, aes(x = age, y = fare)) +
  geom_point(aes(color = class), na.rm = TRUE) +
  geom_smooth(aes(color = class), method = "lm", na.rm = TRUE)

# Same plot plus the overall fitted line (black) for comparison.
ggplot(titanic, aes(x = age, y = fare)) +
  geom_point(aes(color = class), na.rm = TRUE) +
  geom_smooth(aes(color = class), method = "lm", na.rm = TRUE) +
  geom_smooth(method = "lm", na.rm = TRUE, color = "black")

# Coefficients of the regression of fare on age with class-specific
# intercepts and slopes (first class is the reference level).
coef(lm(fare ~ age * class, data = titanic))
## (Intercept) age class2 class3 age:class2 age:class3
## 116.2097283 -0.6123703 -88.9199766 -98.4956718 0.4281848 0.4169871
1st class: \(\hat{fare} = 116.2097283 -0.6123703 \cdot age\)
2nd class: \(\hat{fare} = 27.2897517 -0.1841855 \cdot age\)
3rd class: \(\hat{fare} = 17.7140565 -0.1953832 \cdot age\)
# Age distribution with each bin split by ticket class.
ggplot(titanic, aes(x = age, fill = class)) +
  geom_histogram(na.rm = TRUE)

# Box plot of age for each ticket class.
ggplot(titanic, aes(x = class, y = age, fill = class)) +
  geom_boxplot(na.rm = TRUE)
- ggplot2. Una breve introducción a este paquete se puede encontrar en http://verso.mat.uam.es/~joser.berrendero/R/introggplot2.html.
- RMarkdown. Más información en https://rmarkdown.rstudio.com/.