#HomeWork ## Unit Project 02 - L06: ggplot2
# Keep only the penguins that have every plotted measurement and sex recorded.
penguins <- drop_na(
  penguins,
  flipper_length_mm, body_mass_g, bill_length_mm, bill_depth_mm, sex
)
There is a small difference in the background between the plots, for example in the shading.
# Q1
# Bar chart of the number of penguins observed on each island.
# Only x is mapped: geom_bar() counts the rows in each category itself.
ggplot(data = penguins, aes(x = island)) +
  geom_bar(fill = "blue") +
  labs(
    title = "Bar plot of Number counts for each island",
    x = "different islands", # fixed typo ("differnt") shown on the axis
    y = "Count"
  ) +
  theme_minimal(base_size = 17)
# There is no need to map a y aesthetic here, because geom_bar() automatically counts the observations in each category and scales the y-axis to the maximum count in the data.
# Same bar chart with ggplot2's default theme and labels, for comparison.
# The empty labs() call was dropped: with no arguments it changes nothing.
ggplot(data = penguins, aes(x = island)) +
  geom_bar()
The general shape of the distribution is roughly normal. As the number of bins increases from 10 to 30 to 60, the number of separate bars increases as well, and more detail is shown. The distributions of the different species differ; Gentoo in particular appears to have a bimodal distribution.
# Histogram of body mass drawn with `n_bins` bins.
# The three plots below differ only in the bin count, so the shared layers
# live in one helper instead of three copy-pasted pipelines.
mass_histogram <- function(n_bins) {
  ggplot(data = penguins, aes(x = body_mass_g)) +
    geom_histogram(
      bins = n_bins,
      fill = "blue",
      color = "black"
    ) +
    theme_minimal()
}

# Coarse, medium and fine resolution of the same distribution.
mass_histogram(10)
mass_histogram(30)
mass_histogram(60)
##
# 60-bin body-mass histogram, one panel per species.
penguins %>%
  ggplot(aes(x = body_mass_g)) +
  geom_histogram(bins = 60, fill = "blue", color = "black") +
  facet_wrap(~ species) +
  theme_minimal()
The body mass of Gentoo is much higher than that of Adelie and Chinstrap, and body_mass_g differs between males and females. Adding fill improved the readability.
# Flipper length per species, plain boxes.
penguins %>%
  ggplot(aes(x = species, y = flipper_length_mm)) +
  geom_boxplot() +
  theme_minimal()

# Same plot with each species' box filled in its own colour.
penguins %>%
  ggplot(aes(x = species, y = flipper_length_mm, fill = species)) +
  geom_boxplot() +
  theme_minimal()

# Body mass by sex, with one filled box per species inside each sex.
penguins %>%
  ggplot(aes(x = sex, y = body_mass_g, fill = species)) +
  geom_boxplot() +
  theme_minimal()
The relationship is hard to see without geom_smooth(), but with it we can see that all the relationships are positive. Adding color to the chart makes the different species easier to recognize. Yes, the line fits the data well; it shows the trend.
# Bill depth against bill length, one colour per species.
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm, color = species)
) +
  geom_point() +
  theme_minimal()
# No geom_scatter, only geom_point
# Bill depth against bill length with a per-species linear trend line.
# The formula is spelled out so geom_smooth() does not have to guess it and
# no longer prints its "using formula = 'y ~ x'" message.
ggplot(
  penguins,
  aes(x = bill_length_mm, y = bill_depth_mm, color = species)
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal()
Gentoo shows the strongest relationship: as flipper length increases, body mass increases. facet_grid(sex ~ species) splits the graph into distinct small panels, which makes the differences much clearer to see.
# Body mass against flipper length, coloured by species.
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
  geom_point() +
  theme_minimal()

# Same scatter split into a sex-by-species grid of panels.
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
  geom_point() +
  facet_grid(sex ~ species) +
  theme_minimal()

# Jittered body mass by sex, one panel per species, with a relabelled y axis.
ggplot(
  data = penguins,
  mapping = aes(x = sex, y = body_mass_g, color = species)
) +
  geom_jitter() +
  facet_wrap(~species) +
  labs(y = "body mass (g)", subtitle = "2026-Feb-6") +
  theme_minimal()
Drawing the diagrams is really interesting, as is seeing how different commands lead to different graphs. The commands are easy and straightforward, really concise and easy to use. The difficult part is when a typo slips into a function name; sometimes it is really hard to quickly discover that I wrote it wrong.
Regex metacharacter ^$*+?()[]{}|\
Rule 1
# A plain pattern matches anywhere inside the string.
x <- c("cat", "caterpillar", "dog")
x %>% str_detect("cat")
## [1] TRUE TRUE FALSE
Rule 2: Ranges square brackets define set/ranges of characters
[A-Z] any uppercase letter [a-z] any
lowercase letter [0-9] any number
# TRUE for every word that contains at least one uppercase letter.
x <- c("Abandon", "Abstract", "Advice", "advice")
x %>% str_detect("[A-Z]")
## [1] TRUE TRUE TRUE FALSE
Rule 3: Anchors ^ matches the START of a string
# `^a` requires a lowercase "a" as the first character (matching is
# case-sensitive).
x <- c("Absolute", "basement")
x %>% str_detect("^a")
## [1] FALSE FALSE
Rule 4: wildcards matches any one character . matches
any one character
Example
# `.a` needs some character immediately followed by a lowercase "a".
x <- c("Absolute", "basement")
x %>% str_detect(".a")
## [1] FALSE TRUE
Rule 7: Quantifiers ? 0 or 1 * 0 or more
+ 1 or more {m} exactly m times
Examples
# Quantifier demos: every call tests each word against one pattern.
x <- c("ct", "cat", "caat", "caaat", "cart", "ate", "cave", "caave")
# `a?`: zero or one "a" between "c" and "t".
x %>% str_detect("ca?t")
## [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
# `a*`: any number of "a", including none.
x %>% str_detect("ca*t")
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
# `a+`: at least one "a".
x %>% str_detect("ca+t")
## [1] FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
# `a{2}`: exactly two "a".
x %>% str_detect("ca{2}t")
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
# `a.*t`: an "a", then anything, then a "t".
x %>% str_detect("a.*t")
## [1] FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
# `ca.*t`: as above, but the match must start with "ca".
x %>% str_detect("ca.*t")
## [1] FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# `ca*t` once more, for comparison with `ca.*t`.
x %>% str_detect("ca*t")
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
Rule 8: Escaping to literally match a metacharacter in a string add a leading “\”
Example
# Escaping demo on file names.
x <- c("Study.csv", "Work.csv", "123..csv")
# Unescaped "." matches any character.
x %>% str_detect("S*.csv")
## [1] TRUE TRUE TRUE
# "\\." matches a literal dot.
x %>% str_detect("S*.\\.csv")
## [1] TRUE TRUE TRUE
# "$" additionally requires ".csv" to end the string.
x %>% str_detect("S*.\\.csv$")
## [1] TRUE TRUE TRUE
Rule 9: Shorthands \\d digits (0-9) \\s
whitespace (spaces, tabs) \\w word characters [A-Za-z0-9_]
\\b word boundaries
Examples
x <- c("cat", "a cat", "cat!", "scatter")
# `\\w`: at least one word character somewhere in the string.
x %>% str_detect("\\w")
## [1] TRUE TRUE TRUE TRUE
# `\\w+`: a run of one or more word characters.
x %>% str_detect("\\w+")
## [1] TRUE TRUE TRUE TRUE
# `\\w+$`: the string must end with a word character.
x %>% str_detect("\\w+$")
## [1] TRUE TRUE FALSE TRUE
# `^\\w+$`: the whole string consists of word characters only.
x %>% str_detect("^\\w+$")
## [1] TRUE FALSE FALSE TRUE
# `\\b` word boundaries: "cat" must stand alone as a word.
x %>% str_detect("\\bcat\\b")
## [1] TRUE TRUE TRUE FALSE
str_replace() str_replace_all()
# Load the messy penguin export from the working directory.
penguins_messy <- read.csv(file = "penguins_messy.csv")
# Q1
# One TRUE/FALSE per row: does species_messy contain "adelie"?
penguins_messy$species_messy %>% str_detect("adelie")
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [97] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# This command checks, for each value of species_messy, whether the string contains "adelie".
# Delete every literal "?" from the sex codes.
penguins_messy$sex_messy %>% str_replace_all("\\?", "")
## [1] "M" "f" "f" NA "f" "M" "f" "M" NA NA NA NA "f" "M" "M" "f" "f" "M"
## [19] "f" "M" "f" "M" "f" "M" "M" "f" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M"
## [37] "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "M" NA "f" "M" "f" "M" "f" "M"
## [55] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [73] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "M" "f"
## [91] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [109] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [127] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [145] "f" "M" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "f" "M" "f" "M"
## [163] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "f" "M" "f" "M" NA "M"
## [181] "f" "M" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f"
## [199] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [217] "f" "M" NA "M" "f" "M" "f" "M" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M"
## [235] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "f" "M"
## [253] "f" "M" "f" "M" NA "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" NA "M"
## [271] "f" NA "f" "M" "f" "M" "f" "M" "M" "f" "M" "f" "f" "M" "f" "M" "f" "M"
## [289] "f" "M" "f" "M" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [307] "f" "M" "f" "M" "M" "f" "f" "M" "f" "M" "M" "f" "M" "f" "f" "M" "f" "M"
## [325] "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "M" "f" "f" "M" "f" "M"
## [343] "M" "f"
# IDs must start with two uppercase letters followed by at least one digit.
x <- c("AA12", "AA001")
x %>% str_detect("^[A-Z]{2}\\d+")
## [1] TRUE TRUE
# Re-read the messy export so the steps below start from the raw file.
penguins_messy <- read.csv(file = "penguins_messy.csv")
In the penguins_messy data, the first command checks each value in the species_messy column for "adelie" and outputs TRUE if it is found and FALSE otherwise. In the sex_messy column, the second command finds every "?" and replaces it with a blank. In the island_messy column, the third command removes leading and trailing whitespace and collapses all repeated whitespace into one space.
# One TRUE/FALSE per row: does species_messy contain "adelie"?
str_detect(string = penguins_messy$species_messy, pattern = "adelie")
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [97] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# In the species_messy column of penguins_messy, check whether each name contains "adelie": output TRUE if it does, FALSE otherwise.
# Swap every literal "?" in the sex codes for a single space.
str_replace_all(
  string = penguins_messy$sex_messy,
  pattern = "\\?",
  replacement = " "
)
## [1] "M" "f " "f " NA "f " "M" "f " "M" NA NA NA NA "f " "M" "M"
## [16] "f " "f " "M" "f " "M" "f " "M" "f " "M" "M" "f " "M" "f " "f " "M"
## [31] "f " "M" "f " "M" "f " "M" "M" "f " "f " "M" "f " "M" "f " "M" "f "
## [46] "M" "M" NA "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M"
## [61] "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f "
## [76] "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "M" "f " "M" "f "
## [91] "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f "
## [106] "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M"
## [121] "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f "
## [136] "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "M" "f " "f " "M"
## [151] "f " "M" "f " "M" "f " "M" "M" "f " "f " "M" "f " "M" "f " "M" "f "
## [166] "M" "f " "M" "f " "M" "f " "M" "M" "f " "f " "M" "f " "M" NA "M"
## [181] "f " "M" "M" "f " "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f "
## [196] "M" "M" "f " "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M"
## [211] "f " "M" "f " "M" "f " "M" "f " "M" NA "M" "f " "M" "f " "M" "M"
## [226] "f " "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M"
## [241] "f " "M" "f " "M" "f " "M" "f " "M" "M" "f " "f " "M" "f " "M" "f "
## [256] "M" NA "M" "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" NA "M"
## [271] "f " NA "f " "M" "f " "M" "f " "M" "M" "f " "M" "f " "f " "M" "f "
## [286] "M" "f " "M" "f " "M" "f " "M" "M" "f " "f " "M" "f " "M" "f " "M"
## [301] "f " "M" "f " "M" "f " "M" "f " "M" "f " "M" "M" "f " "f " "M" "f "
## [316] "M" "M" "f " "M" "f " "f " "M" "f " "M" "M" "f " "f " "M" "f " "M"
## [331] "f " "M" "f " "M" "M" "f " "M" "f " "f " "M" "f " "M" "M" "f "
# In the sex_messy column of penguins_messy, find every "?" and replace it with a single blank space.
# Trim the island names and collapse any internal runs of whitespace.
penguins_messy$island_messy %>% str_squish()
## [1] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [7] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [13] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [19] "Torgersen" "Torgersen" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [25] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [31] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [37] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [43] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [49] "Dream" "Dream" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [55] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [61] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [67] "BIS-COE" "BIS-COE" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [73] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [79] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [85] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [91] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [97] "Dream" "Dream" "Dream" "Dream" "BIS-COE" "BIS-COE"
## [103] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [109] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [115] "BIS-COE" "BIS-COE" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [121] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [127] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [133] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [139] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [145] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [151] "Dream" "Dream" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [157] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [163] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [169] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [175] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [181] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [187] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [193] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [199] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [205] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [211] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [217] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [223] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [229] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [235] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [241] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [247] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [253] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [259] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [265] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [271] "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE" "BIS-COE"
## [277] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [283] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [289] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [295] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [301] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [307] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [313] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [319] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [325] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [331] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [337] "Dream" "Dream" "Dream" "Dream" "Dream" "Dream"
## [343] "Dream" "Dream"
# In the island_messy column of penguins_messy, trim leading and trailing whitespace and collapse repeated whitespace into a single space.
#str_detect(penguins_messy$sample_id, "PAL-[0-9]{3}")
# Flag rows whose sample_id contains an ID of the form "PAL-" + three digits.
# The column is referenced by its bare name so mutate() reads it from the data
# flowing through the pipe instead of reaching back to the global data frame.
# NOTE(review): the pattern is unanchored, so longer strings that merely
# contain such an ID also count as valid - confirm this is intended.
penguins_messy <- penguins_messy %>%
  mutate(valid_id = str_detect(sample_id, "PAL-[0-9]{3}"))

# Number of rows with a valid ID (each TRUE counts as 1).
penguins_messy %>%
  pull(valid_id) %>%
  sum()
## [1] 316
# Keep only rows with a valid ID. filter() already keeps exactly the rows
# where the condition is TRUE, so no `== TRUE` comparison is needed.
penguins_messy <- penguins_messy %>%
  filter(valid_id)

# Re-count the valid IDs after filtering.
penguins_messy %>%
  pull(valid_id) %>%
  sum()
## [1] 316
# Pull the leading YYYY-MM-DD date out of file_name and the PAL-### ID out of
# sample_id into their own columns, then preview the result.
penguins_messy <- mutate(
  penguins_messy,
  file_date = str_extract(file_name, "^[0-9]{4}-[0-9]{2}-[0-9]{2}"),
  file_id = str_extract(sample_id, "PAL-[0-9]{3}")
)
head(penguins_messy)
## species_messy island_messy sex_messy sample_id
## 1 adelie Torgersen\t M PAL-535
## 2 adelie Torgersen\t f? PAL-442
## 3 adelie Torgersen\t f? PAL-668
## 4 adelie Torgersen\t <NA> PAL-920
## 5 adelie Torgersen\t f? PAL-223
## 6 adelie Torgersen\t M PAL-899
## file_name
## 1 2024-01-14_Torgersen\t_adelie_PAL-535.csv
## 2 2024-01-17_Torgersen\t_adelie_PAL-442.csv
## 3 2024-01-08_Torgersen\t_adelie_PAL-668.csv
## 4 2024-01-26_Torgersen\t_adelie_PAL-920.csv
## 5 2024-01-06_Torgersen\t_adelie_PAL-223.csv
## 6 2024-01-08_Torgersen\t_adelie_PAL-899.csv
## notes valid_id file_date
## 1 id=PAL-535; island=Torgersen\t; sp=adelie; bill=39.1mm TRUE 2024-01-14
## 2 id=PAL-442; island=Torgersen\t; sp=adelie; bill=39.5mm TRUE 2024-01-17
## 3 id=PAL-668; island=Torgersen\t; sp=adelie; bill=40.3mm TRUE 2024-01-08
## 4 id=PAL-920; island=Torgersen\t; sp=adelie; bill=NAmm TRUE 2024-01-26
## 5 id=PAL-223; island=Torgersen\t; sp=adelie; bill=36.7mm TRUE 2024-01-06
## 6 id=PAL-899; island=Torgersen\t; sp=adelie; bill=39.3mm TRUE 2024-01-08
## file_id
## 1 PAL-535
## 2 PAL-442
## 3 PAL-668
## 4 PAL-920
## 5 PAL-223
## 6 PAL-899
# Store a whitespace-normalised copy of the island name and list the
# distinct values that remain.
penguins_messy <- mutate(penguins_messy, island = str_squish(island_messy))
penguins_messy$island %>% unique()
## [1] "Torgersen" "BIS-COE" "Dream"
# Normalise the sex codes: drop "?" and upper-case "m" / "f".
# A named vector makes str_replace_all() apply the replacements one after
# another, in the order listed.
penguins_messy <- mutate(
  penguins_messy,
  sex = str_replace_all(sex_messy, c("\\?" = "", "m" = "M", "f" = "F"))
)
penguins_messy$sex %>% unique()
## [1] "M" "F" NA
# Pull the numeric bill length out of the free-text notes ("...; bill=39.1mm").
# str_extract() returns NA when no number follows "bill=" (e.g. "bill=NAmm"),
# so as.numeric() never sees leftover text and the previous
# "NAs introduced by coercion" warning no longer occurs.
penguins_messy <- penguins_messy %>%
  mutate(
    bill_mm = notes %>%
      str_extract("(?<=bill=)[0-9]+\\.?[0-9]*") %>%
      as.numeric()
  )
It is amazing that regular expressions can quickly extract the things we want. Yes, I think I have a good understanding of how to use them.
Terms: drop_na() drops rows where any specified column
contains a missing value. distinct() keeps only one copy of
rows that are exact duplicates in the data frame.
where(is.type) selects columns by type or property, making
code robust to new columns.
select() + starts_with("string")
select() + ends_with("string")
select() + contains("string")
select() + matches("regex")
across() applies a function to each selected column,
e.g. summarize(across(where(is.numeric), mean))
left_join(x, y, by = "column name")
~ if the condition on the left is true, return the value on the right
# Letter grades from scores; rules are tried top to bottom and the first
# match wins.
x <- c(50, 70, 90)
case_when(
  x <= 50 ~ "F",
  x < 90 ~ "B", # was `x <= 80`, which left scores 81-89 unmatched (NA)
  x >= 90 ~ "A"
)
## [1] "F" "B" "A"
# if_else(condition, true, false) picks one of two values per row.
# Rows where a > 2 get TRUE.
data.frame(a = c(1, 2, 3)) %>%
  mutate(b = if_else(condition = a > 2, true = TRUE, false = FALSE))
## a b
## 1 1 FALSE
## 2 2 FALSE
## 3 3 TRUE
# Swapping the two branches inverts the result.
data.frame(a = c(1, 2, 3)) %>%
  mutate(b = if_else(condition = a > 2, true = FALSE, false = TRUE))
## a b
## 1 1 TRUE
## 2 2 TRUE
## 3 3 FALSE
# The branches can be any values of a common type, here numbers.
data.frame(a = c(1, 2, 3)) %>%
  mutate(b = if_else(condition = a > 2, true = 1, false = 2))
## a b
## 1 1 2
## 2 2 2
## 3 3 1
# select() with no column arguments keeps every row but no columns.
select(penguins)
## # A tibble: 333 × 0
# Per-species mean of every numeric column, using complete rows only.
penguins %>%
  drop_na() %>%
  group_by(species) %>%
  summarise(across(where(is.numeric), mean))
## # A tibble: 3 × 6
## species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 18.3 190. 3706. 2008.
## 2 Chinstrap 48.8 18.4 196. 3733. 2008.
## 3 Gentoo 47.6 15.0 217. 5092. 2008.
# Label each penguin "heavy" (>= 4500 g) or "light" next to its body mass.
penguins %>%
  select(body_mass_g) %>%
  mutate(heavy = if_else(body_mass_g >= 4500, "heavy", "light"))
## # A tibble: 333 × 2
## body_mass_g heavy
## <int> <chr>
## 1 3750 light
## 2 3800 light
## 3 3250 light
## 4 3450 light
## 5 3650 light
## 6 3625 light
## 7 4675 heavy
## 8 3200 light
## 9 3800 light
## 10 4400 light
## # ℹ 323 more rows
There are originally 344 rows; after cleaning there are 333 rows.
library("palmerpenguins")
# Drop rows with a missing bill length or sex.
penguins_clean <- penguins %>%
  drop_na(bill_length_mm, sex)
# `penguins` was already overwritten with a drop_na() result near the top of
# this file, so the "before" count is taken from the untouched package
# dataset; otherwise both counts print 333 and the comparison shows nothing.
nrow(palmerpenguins::penguins)
## [1] 344
nrow(penguins_clean)
## [1] 333
Because duplicated rows can distort means and distributions.
# Simulate accidental duplicates: append 30 rows resampled (with replacement)
# from the cleaned data, then check the new row count.
penguins_dup <- bind_rows(
  penguins_clean,
  slice_sample(penguins_clean, n = 30, replace = TRUE)
)
nrow(penguins_dup)
## [1] 363
There are eight columns in the resulting data frame.
# Keep the three identifying columns plus every numeric column, then count
# the columns that result.
penguins_selected <- select(
  penguins_clean,
  species, island, sex, where(is.numeric)
)
ncol(penguins_selected)
## [1] 8
# Columns whose names begin with "bill".
select(penguins_clean, starts_with("bill"))
## # A tibble: 333 × 2
## bill_length_mm bill_depth_mm
## <dbl> <dbl>
## 1 39.1 18.7
## 2 39.5 17.4
## 3 40.3 18
## 4 36.7 19.3
## 5 39.3 20.6
## 6 38.9 17.8
## 7 39.2 19.6
## 8 41.1 17.6
## 9 38.6 21.2
## 10 34.6 21.1
## # ℹ 323 more rows
# Columns whose names end in "_mm".
select(penguins_clean, ends_with("_mm"))
## # A tibble: 333 × 3
## bill_length_mm bill_depth_mm flipper_length_mm
## <dbl> <dbl> <int>
## 1 39.1 18.7 181
## 2 39.5 17.4 186
## 3 40.3 18 195
## 4 36.7 19.3 193
## 5 39.3 20.6 190
## 6 38.9 17.8 181
## 7 39.2 19.6 195
## 8 41.1 17.6 182
## 9 38.6 21.2 191
## 10 34.6 21.1 198
## # ℹ 323 more rows
# Columns whose names start with "bill" or "flipper" (regex match).
select(penguins_clean, matches("^(bill|flipper)"))
## # A tibble: 333 × 3
## bill_length_mm bill_depth_mm flipper_length_mm
## <dbl> <dbl> <int>
## 1 39.1 18.7 181
## 2 39.5 17.4 186
## 3 40.3 18 195
## 4 36.7 19.3 193
## 5 39.3 20.6 190
## 6 38.9 17.8 181
## 7 39.2 19.6 195
## 8 41.1 17.6 182
## 9 38.6 21.2 191
## 10 34.6 21.1 198
## # ℹ 323 more rows
It is more concise, and typing mistakes are less likely.
# Per-species mean of every numeric column in the cleaned data.
summarize(
  group_by(penguins_clean, species),
  across(where(is.numeric), mean)
)
## # A tibble: 3 × 6
## species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 18.3 190. 3706. 2008.
## 2 Chinstrap 48.8 18.4 196. 3733. 2008.
## 3 Gentoo 47.6 15.0 217. 5092. 2008.
# Bin bill length into "short" (< 40 mm), "medium" (< 50 mm) and
# "long" (>= 50 mm); the first matching rule wins.
penguins_clean <- mutate(
  penguins_clean,
  bill_length = case_when(
    bill_length_mm < 40 ~ "short",
    bill_length_mm < 50 ~ "medium",
    bill_length_mm >= 50 ~ "long"
  )
)
penguins_clean$bill_length %>% head()
## [1] "short" "short" "medium" "short" "short" "short"
penguins_instrumented has 12 columns, and penguins_clean has 9.
# Lookup table with one row per measuring instrument.
instrument_meta <- tibble(
  # join key
  instrument_id = c(
    "caliper_A",
    "caliper_B",
    "caliper_C"
  ),
  calibration_mm = c(
    0.2,
    0.5,
    1.0
  ),
  manufacturer = c(
    "Mitutoyo",
    "Fowler",
    "Generic"
  )
)
# Assign each measurement an instrument based on its survey year, after
# dropping rows without a bill length.
penguins_instrumented <- mutate(
  drop_na(penguins_clean, bill_length_mm),
  instrument_id = case_when(
    year == 2007 ~ "caliper_A",
    year == 2008 ~ "caliper_B",
    year == 2009 ~ "caliper_C"
  )
)
# Attach the instrument details to every measurement; a left join keeps all
# penguin rows. Then compare column counts before and after.
penguins_instrumented <- left_join(
  penguins_instrumented,
  instrument_meta,
  by = "instrument_id"
)
ncol(penguins_clean)
## [1] 9
ncol(penguins_instrumented)
## [1] 12
They are helpful, especially the left join. I have faced similar questions before in data analysis.