In this tutorial we will load the data created in the first tutorial.
Once the data is loaded, we will perform a conditional mutation and introduce the group_by function.
Finally, we will go over how to subset your data using filter and select.
#install.packages("tidyverse") ##uncomment and run once if tidyverse is not installed
library(tidyverse)
########Read in the data we created in TidyTutorial 1
dat<-read.csv("Tidy1_dat.csv")
########View the first 10 rows
head(dat, 10)
## id group time score
## 1 1 0 0 0.30
## 2 1 0 1 0.26
## 3 1 0 2 0.53
## 4 1 0 3 1.90
## 5 1 0 4 -1.77
## 6 1 0 5 -3.61
## 7 1 0 6 1.38
## 8 1 0 7 1.22
## 9 1 0 8 -2.64
## 10 1 0 9 -1.38
########This data contains four variables: id, group, time, and score
########We will use this data to go over some basic functions for data manipulation and cleaning
##Tidyverse part 1: Mastering the mutate function
####We can combine mutate and group_by to create summary info for each id
dat %>%
  group_by(id) %>% ##one group per person
  mutate(mean_score = mean(score)) ##average score within each id
## # A tibble: 100 x 5
## # Groups: id [10]
## id group time score mean_score
## <int> <int> <int> <dbl> <dbl>
## 1 1 0 0 0.3 -0.381
## 2 1 0 1 0.26 -0.381
## 3 1 0 2 0.53 -0.381
## 4 1 0 3 1.9 -0.381
## 5 1 0 4 -1.77 -0.381
## 6 1 0 5 -3.61 -0.381
## 7 1 0 6 1.38 -0.381
## 8 1 0 7 1.22 -0.381
## 9 1 0 8 -2.64 -0.381
## 10 1 0 9 -1.38 -0.381
## # … with 90 more rows
##Here we now have an average score for each person
####We can also do a conditional mutate; creating a new variable based on an existing variable
##Flag each score's sign: "positive" for score >= 0, otherwise "negative"
dat %>% mutate(pos_neg = ifelse(score >= 0, "positive", "negative"))
## id group time score pos_neg
## 1 1 0 0 0.30 positive
## 2 1 0 1 0.26 positive
## 3 1 0 2 0.53 positive
## 4 1 0 3 1.90 positive
## 5 1 0 4 -1.77 negative
## 6 1 0 5 -3.61 negative
## 7 1 0 6 1.38 positive
## 8 1 0 7 1.22 positive
## 9 1 0 8 -2.64 negative
## 10 1 0 9 -1.38 negative
## 11 2 1 0 -0.24 negative
## 12 2 1 1 0.23 positive
## 13 2 1 2 2.25 positive
## 14 2 1 3 0.66 positive
## 15 2 1 4 1.24 positive
## 16 2 1 5 0.92 positive
## 17 2 1 6 1.76 positive
## 18 2 1 7 -1.04 negative
## 19 2 1 8 3.40 positive
## 20 2 1 9 -1.81 negative
## 21 3 0 0 -1.29 negative
## 22 3 0 1 2.68 positive
## 23 3 0 2 -1.41 negative
## 24 3 0 3 4.49 positive
## 25 3 0 4 0.65 positive
## 26 3 0 5 0.23 positive
## 27 3 0 6 -0.43 negative
## 28 3 0 7 -0.48 negative
## 29 3 0 8 -0.62 negative
## 30 3 0 9 -3.32 negative
## 31 4 1 0 1.20 positive
## 32 4 1 1 -1.84 negative
## 33 4 1 2 -1.85 negative
## 34 4 1 3 -0.38 negative
## 35 4 1 4 0.52 positive
## 36 4 1 5 -0.40 negative
## 37 4 1 6 0.78 positive
## 38 4 1 7 2.04 positive
## 39 4 1 8 0.89 positive
## 40 4 1 9 0.16 positive
## 41 5 0 0 2.43 positive
## 42 5 0 1 0.94 positive
## 43 5 0 2 -2.04 negative
## 44 5 0 3 0.33 positive
## 45 5 0 4 -1.17 negative
## 46 5 0 5 1.20 positive
## 47 5 0 6 0.63 positive
## 48 5 0 7 -0.15 negative
## 49 5 0 8 -1.87 negative
## 50 5 0 9 0.79 positive
## 51 6 1 0 1.97 positive
## 52 6 1 1 -2.13 negative
## 53 6 1 2 -0.79 negative
## 54 6 1 3 -1.10 negative
## 55 6 1 4 -0.11 negative
## 56 6 1 5 -1.15 negative
## 57 6 1 6 -0.78 negative
## 58 6 1 7 1.95 positive
## 59 6 1 8 2.09 positive
## 60 6 1 9 0.87 positive
## 61 7 0 0 1.22 positive
## 62 7 0 1 1.27 positive
## 63 7 0 2 0.36 positive
## 64 7 0 3 0.01 positive
## 65 7 0 4 -1.86 negative
## 66 7 0 5 0.39 positive
## 67 7 0 6 -1.22 negative
## 68 7 0 7 2.73 positive
## 69 7 0 8 -1.84 negative
## 70 7 0 9 -0.76 negative
## 71 8 1 0 4.51 positive
## 72 8 1 1 -0.16 negative
## 73 8 1 2 0.41 positive
## 74 8 1 3 -0.17 negative
## 75 8 1 4 2.29 positive
## 76 8 1 5 1.09 positive
## 77 8 1 6 0.98 positive
## 78 8 1 7 -1.00 negative
## 79 8 1 8 -0.41 negative
## 80 8 1 9 1.27 positive
## 81 9 0 0 -1.69 negative
## 82 9 0 1 0.72 positive
## 83 9 0 2 2.57 positive
## 84 9 0 3 0.47 positive
## 85 9 0 4 -0.37 negative
## 86 9 0 5 1.43 positive
## 87 9 0 6 0.77 positive
## 88 9 0 7 -0.68 negative
## 89 9 0 8 -0.96 negative
## 90 9 0 9 0.67 positive
## 91 10 1 0 1.52 positive
## 92 10 1 1 -2.78 negative
## 93 10 1 2 1.79 positive
## 94 10 1 3 -2.38 negative
## 95 10 1 4 0.77 positive
## 96 10 1 5 -0.36 negative
## 97 10 1 6 0.99 positive
## 98 10 1 7 0.86 positive
## 99 10 1 8 -2.51 negative
## 100 10 1 9 0.29 positive
##Here we are creating a variable indicating whether the score is positive or negative
####Let’s combine these techniques to label each person’s average score as positive or negative
##Compute each person's mean score, then label the sign of that mean
dat %>%
group_by(id) %>%
mutate(mean_score = mean(score),
       pos_neg_mean = ifelse(mean_score < 0, "negative", "positive"))
## # A tibble: 100 x 6
## # Groups: id [10]
## id group time score mean_score pos_neg_mean
## <int> <int> <int> <dbl> <dbl> <chr>
## 1 1 0 0 0.3 -0.381 negative
## 2 1 0 1 0.26 -0.381 negative
## 3 1 0 2 0.53 -0.381 negative
## 4 1 0 3 1.9 -0.381 negative
## 5 1 0 4 -1.77 -0.381 negative
## 6 1 0 5 -3.61 -0.381 negative
## 7 1 0 6 1.38 -0.381 negative
## 8 1 0 7 1.22 -0.381 negative
## 9 1 0 8 -2.64 -0.381 negative
## 10 1 0 9 -1.38 -0.381 negative
## # … with 90 more rows
##Here we are creating a variable indicating whether the mean score is positive or negative
####Let’s create a few more summary variables and save it as “summary_dat”
summary_dat<-dat %>%
group_by(id) %>%
mutate(mean_score = mean(score)) %>% ##mean
mutate(var_score = var(score)) %>% ##variance
mutate(min_score = min(score)) %>% ##min
mutate(max_score = max(score)) ##max
##Tidyverse part 2: Choosing specific columns (select) and rows (filter)
####Viewing our summary data, we have a lot of redundant rows containing the same info
summary_dat
## # A tibble: 100 x 8
## # Groups: id [10]
## id group time score mean_score var_score min_score max_score
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0 0 0.3 -0.381 3.45 -3.61 1.9
## 2 1 0 1 0.26 -0.381 3.45 -3.61 1.9
## 3 1 0 2 0.53 -0.381 3.45 -3.61 1.9
## 4 1 0 3 1.9 -0.381 3.45 -3.61 1.9
## 5 1 0 4 -1.77 -0.381 3.45 -3.61 1.9
## 6 1 0 5 -3.61 -0.381 3.45 -3.61 1.9
## 7 1 0 6 1.38 -0.381 3.45 -3.61 1.9
## 8 1 0 7 1.22 -0.381 3.45 -3.61 1.9
## 9 1 0 8 -2.64 -0.381 3.45 -3.61 1.9
## 10 1 0 9 -1.38 -0.381 3.45 -3.61 1.9
## # … with 90 more rows
####If we only want to view our summary variables, first we can select them
##id plus every column between mean_score and max_score (inclusive)
select(summary_dat, id, mean_score:max_score)
## # A tibble: 100 x 5
## # Groups: id [10]
## id mean_score var_score min_score max_score
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.381 3.45 -3.61 1.9
## 2 1 -0.381 3.45 -3.61 1.9
## 3 1 -0.381 3.45 -3.61 1.9
## 4 1 -0.381 3.45 -3.61 1.9
## 5 1 -0.381 3.45 -3.61 1.9
## 6 1 -0.381 3.45 -3.61 1.9
## 7 1 -0.381 3.45 -3.61 1.9
## 8 1 -0.381 3.45 -3.61 1.9
## 9 1 -0.381 3.45 -3.61 1.9
## 10 1 -0.381 3.45 -3.61 1.9
## # … with 90 more rows
##This selects id plus all columns between mean_score and max_score
####Once we have subsetted our summary columns, we can use ‘unique’ to get 1 row per person
##'distinct' is the dplyr equivalent of base 'unique' for data frames:
##it drops duplicated rows, and is the idiomatic (and faster) choice for tibbles
summary_dat<-summary_dat %>% select(id, mean_score:max_score) %>% distinct()
summary_dat
## # A tibble: 10 x 5
## # Groups: id [10]
## id mean_score var_score min_score max_score
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.381 3.45 -3.61 1.9
## 2 2 0.737 2.40 -1.81 3.4
## 3 3 0.05 4.81 -3.32 4.49
## 4 4 0.112 1.59 -1.85 2.04
## 5 5 0.109 2.03 -2.04 2.43
## 6 6 0.0820 2.34 -2.13 2.09
## 7 7 0.0300 2.19 -1.86 2.73
## 8 8 0.881 2.55 -1 4.51
## 9 9 0.293 1.55 -1.69 2.57
## 10 10 -0.181 3.05 -2.78 1.79
##Now we have a tibble with one row of summary data per id
####Let’s say we want to redo this process, but only for people in group 1
####Starting with our original data, let’s use filter to select rows where group==1
##Keep only the rows belonging to group 1
filter(dat, group == 1)
## id group time score
## 1 2 1 0 -0.24
## 2 2 1 1 0.23
## 3 2 1 2 2.25
## 4 2 1 3 0.66
## 5 2 1 4 1.24
## 6 2 1 5 0.92
## 7 2 1 6 1.76
## 8 2 1 7 -1.04
## 9 2 1 8 3.40
## 10 2 1 9 -1.81
## 11 4 1 0 1.20
## 12 4 1 1 -1.84
## 13 4 1 2 -1.85
## 14 4 1 3 -0.38
## 15 4 1 4 0.52
## 16 4 1 5 -0.40
## 17 4 1 6 0.78
## 18 4 1 7 2.04
## 19 4 1 8 0.89
## 20 4 1 9 0.16
## 21 6 1 0 1.97
## 22 6 1 1 -2.13
## 23 6 1 2 -0.79
## 24 6 1 3 -1.10
## 25 6 1 4 -0.11
## 26 6 1 5 -1.15
## 27 6 1 6 -0.78
## 28 6 1 7 1.95
## 29 6 1 8 2.09
## 30 6 1 9 0.87
## 31 8 1 0 4.51
## 32 8 1 1 -0.16
## 33 8 1 2 0.41
## 34 8 1 3 -0.17
## 35 8 1 4 2.29
## 36 8 1 5 1.09
## 37 8 1 6 0.98
## 38 8 1 7 -1.00
## 39 8 1 8 -0.41
## 40 8 1 9 1.27
## 41 10 1 0 1.52
## 42 10 1 1 -2.78
## 43 10 1 2 1.79
## 44 10 1 3 -2.38
## 45 10 1 4 0.77
## 46 10 1 5 -0.36
## 47 10 1 6 0.99
## 48 10 1 7 0.86
## 49 10 1 8 -2.51
## 50 10 1 9 0.29
##This results in 50 rows of data for 5 ids
####Now we can add our ‘group_by’, ‘mutate’, ‘select’, and ‘unique’ functions after the ‘filter’
##Summaries for group 1 only: filter first, then build the per-id summary columns
group_1_summary_dat<-dat %>%
filter(group==1) %>%
group_by(id) %>%
mutate(mean_score = mean(score), ##mean
       var_score = var(score),   ##variance
       min_score = min(score),   ##min
       max_score = max(score)) %>% ##max
select(id, group, mean_score:max_score) %>%
unique()
group_1_summary_dat
## # A tibble: 5 x 6
## # Groups: id [5]
## id group mean_score var_score min_score max_score
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2 1 0.737 2.40 -1.81 3.4
## 2 4 1 0.112 1.59 -1.85 2.04
## 3 6 1 0.0820 2.34 -2.13 2.09
## 4 8 1 0.881 2.55 -1 4.51
## 5 10 1 -0.181 3.05 -2.78 1.79
##Here we have included 'group' in our select call
####Now let’s say we are only interested in observations where the “score” variable is > 1
####We can filter our ‘dat’ tibble to only select scores > 1
##Keep only the rows where score exceeds 1
filter(dat, score > 1)
## id group time score
## 1 1 0 3 1.90
## 2 1 0 6 1.38
## 3 1 0 7 1.22
## 4 2 1 2 2.25
## 5 2 1 4 1.24
## 6 2 1 6 1.76
## 7 2 1 8 3.40
## 8 3 0 1 2.68
## 9 3 0 3 4.49
## 10 4 1 0 1.20
## 11 4 1 7 2.04
## 12 5 0 0 2.43
## 13 5 0 5 1.20
## 14 6 1 0 1.97
## 15 6 1 7 1.95
## 16 6 1 8 2.09
## 17 7 0 0 1.22
## 18 7 0 1 1.27
## 19 7 0 7 2.73
## 20 8 1 0 4.51
## 21 8 1 4 2.29
## 22 8 1 5 1.09
## 23 8 1 9 1.27
## 24 9 0 2 2.57
## 25 9 0 5 1.43
## 26 10 1 0 1.52
## 27 10 1 2 1.79
##We have 27 observations of score > 1
####We can filter by both group and score; let’s subset rows where group == 0 and score is < 1
##Multiple conditions in one filter call are combined with AND,
##so this is equivalent to chaining two separate filters
dat %>% filter(group==0, score < 1)
## id group time score
## 1 1 0 0 0.30
## 2 1 0 1 0.26
## 3 1 0 2 0.53
## 4 1 0 4 -1.77
## 5 1 0 5 -3.61
## 6 1 0 8 -2.64
## 7 1 0 9 -1.38
## 8 3 0 0 -1.29
## 9 3 0 2 -1.41
## 10 3 0 4 0.65
## 11 3 0 5 0.23
## 12 3 0 6 -0.43
## 13 3 0 7 -0.48
## 14 3 0 8 -0.62
## 15 3 0 9 -3.32
## 16 5 0 1 0.94
## 17 5 0 2 -2.04
## 18 5 0 3 0.33
## 19 5 0 4 -1.17
## 20 5 0 6 0.63
## 21 5 0 7 -0.15
## 22 5 0 8 -1.87
## 23 5 0 9 0.79
## 24 7 0 2 0.36
## 25 7 0 3 0.01
## 26 7 0 4 -1.86
## 27 7 0 5 0.39
## 28 7 0 6 -1.22
## 29 7 0 8 -1.84
## 30 7 0 9 -0.76
## 31 9 0 0 -1.69
## 32 9 0 1 0.72
## 33 9 0 3 0.47
## 34 9 0 4 -0.37
## 35 9 0 6 0.77
## 36 9 0 7 -0.68
## 37 9 0 8 -0.96
## 38 9 0 9 0.67
##We have 38 observations where group == 0 and score is < 1
##Write out data for part 3
####Finally, let’s create a new variable that we will use in part 3 of the tutorial
##score_2 grows by half a point for every unit of time
dat<-dat %>% mutate(score_2 = score + 0.5*time)
dat
## id group time score score_2
## 1 1 0 0 0.30 0.30
## 2 1 0 1 0.26 0.76
## 3 1 0 2 0.53 1.53
## 4 1 0 3 1.90 3.40
## 5 1 0 4 -1.77 0.23
## 6 1 0 5 -3.61 -1.11
## 7 1 0 6 1.38 4.38
## 8 1 0 7 1.22 4.72
## 9 1 0 8 -2.64 1.36
## 10 1 0 9 -1.38 3.12
## 11 2 1 0 -0.24 -0.24
## 12 2 1 1 0.23 0.73
## 13 2 1 2 2.25 3.25
## 14 2 1 3 0.66 2.16
## 15 2 1 4 1.24 3.24
## 16 2 1 5 0.92 3.42
## 17 2 1 6 1.76 4.76
## 18 2 1 7 -1.04 2.46
## 19 2 1 8 3.40 7.40
## 20 2 1 9 -1.81 2.69
## 21 3 0 0 -1.29 -1.29
## 22 3 0 1 2.68 3.18
## 23 3 0 2 -1.41 -0.41
## 24 3 0 3 4.49 5.99
## 25 3 0 4 0.65 2.65
## 26 3 0 5 0.23 2.73
## 27 3 0 6 -0.43 2.57
## 28 3 0 7 -0.48 3.02
## 29 3 0 8 -0.62 3.38
## 30 3 0 9 -3.32 1.18
## 31 4 1 0 1.20 1.20
## 32 4 1 1 -1.84 -1.34
## 33 4 1 2 -1.85 -0.85
## 34 4 1 3 -0.38 1.12
## 35 4 1 4 0.52 2.52
## 36 4 1 5 -0.40 2.10
## 37 4 1 6 0.78 3.78
## 38 4 1 7 2.04 5.54
## 39 4 1 8 0.89 4.89
## 40 4 1 9 0.16 4.66
## 41 5 0 0 2.43 2.43
## 42 5 0 1 0.94 1.44
## 43 5 0 2 -2.04 -1.04
## 44 5 0 3 0.33 1.83
## 45 5 0 4 -1.17 0.83
## 46 5 0 5 1.20 3.70
## 47 5 0 6 0.63 3.63
## 48 5 0 7 -0.15 3.35
## 49 5 0 8 -1.87 2.13
## 50 5 0 9 0.79 5.29
## 51 6 1 0 1.97 1.97
## 52 6 1 1 -2.13 -1.63
## 53 6 1 2 -0.79 0.21
## 54 6 1 3 -1.10 0.40
## 55 6 1 4 -0.11 1.89
## 56 6 1 5 -1.15 1.35
## 57 6 1 6 -0.78 2.22
## 58 6 1 7 1.95 5.45
## 59 6 1 8 2.09 6.09
## 60 6 1 9 0.87 5.37
## 61 7 0 0 1.22 1.22
## 62 7 0 1 1.27 1.77
## 63 7 0 2 0.36 1.36
## 64 7 0 3 0.01 1.51
## 65 7 0 4 -1.86 0.14
## 66 7 0 5 0.39 2.89
## 67 7 0 6 -1.22 1.78
## 68 7 0 7 2.73 6.23
## 69 7 0 8 -1.84 2.16
## 70 7 0 9 -0.76 3.74
## 71 8 1 0 4.51 4.51
## 72 8 1 1 -0.16 0.34
## 73 8 1 2 0.41 1.41
## 74 8 1 3 -0.17 1.33
## 75 8 1 4 2.29 4.29
## 76 8 1 5 1.09 3.59
## 77 8 1 6 0.98 3.98
## 78 8 1 7 -1.00 2.50
## 79 8 1 8 -0.41 3.59
## 80 8 1 9 1.27 5.77
## 81 9 0 0 -1.69 -1.69
## 82 9 0 1 0.72 1.22
## 83 9 0 2 2.57 3.57
## 84 9 0 3 0.47 1.97
## 85 9 0 4 -0.37 1.63
## 86 9 0 5 1.43 3.93
## 87 9 0 6 0.77 3.77
## 88 9 0 7 -0.68 2.82
## 89 9 0 8 -0.96 3.04
## 90 9 0 9 0.67 5.17
## 91 10 1 0 1.52 1.52
## 92 10 1 1 -2.78 -2.28
## 93 10 1 2 1.79 2.79
## 94 10 1 3 -2.38 -0.88
## 95 10 1 4 0.77 2.77
## 96 10 1 5 -0.36 2.14
## 97 10 1 6 0.99 3.99
## 98 10 1 7 0.86 4.36
## 99 10 1 8 -2.51 1.49
## 100 10 1 9 0.29 4.79
####Let’s write out the data
getwd()
## [1] "/Users/Patrick/Documents/New/content/post"
##This shows you your current working directory
##This is where the data will be written to
write.csv(dat, file="Tidy2_dat.csv", row.names = FALSE) ##row.names = FALSE prevents an extra index column in the csv