#HomeWork ## Unit Project 02 - L06: ggplot2

# Drop every row that has a missing value in any of the listed columns.
penguins <- drop_na(
  penguins,
  flipper_length_mm, body_mass_g, bill_length_mm, bill_depth_mm, sex
)

Q1

There is a little difference in the background, for example the shade.

# Q1
# Bar chart of the number of penguins recorded on each island. Only x is
# mapped: geom_bar() counts the rows for each island itself.
ggplot(data = penguins, aes(x = island)) +
  geom_bar(fill = "blue") +
  labs(
    title = "Bar plot of Number counts for each island",
    x = "different islands", # fixed typo ("differnt") shown on the axis
    y = "Count"
  ) +
  theme_minimal(base_size = 17)

# No y aesthetic is needed here: geom_bar() counts the observations for each
# island and the y-axis is scaled to the largest count in the data.
# Same chart again with default styling (labs() is called with no arguments).
penguins %>%
  ggplot(aes(x = island)) +
  geom_bar() +
  labs()

Q2

The overall shape of the distribution is roughly normal. As the number of bins increases from 10 to 30 to 60, the number of separate bars increases as well, and more detail becomes visible. The distributions differ between species; Gentoo in particular looks bimodal.

# Body-mass histograms drawn with 10, 30 and 60 bins so the effect of the
# bin count can be compared.
penguins %>%
  ggplot(aes(x = body_mass_g)) +
  geom_histogram(bins = 10, fill = "blue", colour = "black") +
  theme_minimal()

penguins %>%
  ggplot(aes(x = body_mass_g)) +
  geom_histogram(bins = 30, fill = "blue", colour = "black") +
  theme_minimal()

penguins %>%
  ggplot(aes(x = body_mass_g)) +
  geom_histogram(bins = 60, fill = "blue", colour = "black") +
  theme_minimal()

##


# 60-bin body-mass histogram, one panel per species.
penguins %>%
  ggplot(aes(x = body_mass_g)) +
  geom_histogram(bins = 60, fill = "blue", colour = "black") +
  facet_wrap(~ species) +
  theme_minimal()

Q3

The body mass of Gentoo is much higher than that of Adelie and Chinstrap, and body_mass_g also differs between males and females. Adding fill makes the plot easier to read.

# Flipper length by species, without any fill.
penguins %>%
  ggplot(aes(x = species, y = flipper_length_mm)) +
  geom_boxplot() +
  theme_minimal()

# Same plot with each species given its own fill colour.
penguins %>%
  ggplot(aes(x = species, y = flipper_length_mm, fill = species)) +
  geom_boxplot() +
  theme_minimal()

# Body mass by sex, with one box per species inside each sex.
penguins %>%
  ggplot(aes(x = sex, y = body_mass_g, fill = species)) +
  geom_boxplot() +
  theme_minimal()

Q4

The relationship is hard to see without geom_smooth(); with it, the relationship is clearly positive for every species. Adding color makes the different species easier to tell apart. Yes, the lines fit the data well and show the trend.

# Bill depth against bill length, coloured by species.
# ggplot2 has no geom_scatter(); scatter plots are drawn with geom_point().
penguins %>%
  ggplot(aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
  geom_point() +
  theme_minimal()

# Same scatter plot with a linear-model fit added. color = species is mapped
# in ggplot(), so geom_smooth() fits one line per species.
penguins %>%
  ggplot(aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Q5

Gentoo shows the strongest relationship: as flipper length increases, body mass increases. facet_grid(sex ~ species) splits the plot into separate small panels, which makes the differences much clearer to see.

# Body mass against flipper length, coloured by species.
penguins %>%
  ggplot(aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
  geom_point() +
  theme_minimal()

# Same data split into a grid of panels: one row per sex, one column per
# species.
penguins %>%
  ggplot(aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
  geom_point() +
  facet_grid(sex ~ species) +
  theme_minimal()

Q6

# Jittered body mass by sex, one panel per species, with a custom y label
# and a dated subtitle.
penguins %>%
  ggplot(aes(x = sex, y = body_mass_g, color = species)) +
  geom_jitter() +
  facet_wrap(~ species) +
  labs(
    y = "body mass (g)",
    subtitle = "2026-Feb-6"
  ) +
  theme_minimal()

Q7

Drawing the diagrams is really interesting, especially seeing how different commands lead to different graphs. The commands are easy, straightforward, and concise to use. The difficult part is typos in function names: it is sometimes really hard to quickly spot what was written wrong.

Lec 7

Unit Project 02 - L07: Regular Expressions

Regular Expressions (Regex)

Regex metacharacter ^$*+?()[]{}|\

Rule 1

# A plain pattern matches anywhere inside a string, so both "cat" and
# "caterpillar" are detected.
x <- c("cat", "caterpillar", "dog")
str_detect(string = x, pattern = "cat")

## [1]  TRUE  TRUE FALSE

Rule 2: Ranges square brackets define set/ranges of characters

[A-Z] any uppercase letter [a-z] any lowercase letter [0-9] any number

# [A-Z] needs at least one uppercase letter somewhere in the string; the
# all-lowercase "advice" has none.
x <- c("Abandon", "Abstract", "Advice", "advice")
str_detect(string = x, pattern = "[A-Z]")

## [1]  TRUE  TRUE  TRUE FALSE

Rule 3: Anchors ^ matches the START of a string

# "^a" requires a lowercase "a" as the first character. Matching is
# case-sensitive, so "Absolute" (capital A) fails, and so does "basement".
x <- c("Absolute", "basement")
str_detect(string = x, pattern = "^a")

## [1] FALSE FALSE

Rule 4: Wildcards . matches any one character

Example

# ".a" needs any single character followed by a lowercase "a". "basement"
# has "ba"; "Absolute" contains no lowercase "a" at all.
x <- c("Absolute", "basement")
str_detect(string = x, pattern = ".a")

## [1] FALSE  TRUE

Rule 7: Quantifiers ? 0 or 1 * 0 or more + 1 or more {m} exactly m times

Examples

x <- c("ct", "cat", "caat", "caaat", "cart", "ate", "cave", "caave")

# "a?" : zero or one "a" between "c" and "t".
str_detect(string = x, pattern = "ca?t")

## [1]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE

# "a*" : zero or more "a" between "c" and "t".
str_detect(string = x, pattern = "ca*t")

## [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE

# "a+" : one or more "a" between "c" and "t" (so "ct" no longer matches).
str_detect(string = x, pattern = "ca+t")

## [1] FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE

# "a{2}" : exactly two "a" between "c" and "t".
str_detect(string = x, pattern = "ca{2}t")

## [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE

# "a.*t" : an "a", then any run of characters, then a "t" ("ate" matches).
str_detect(string = x, pattern = "a.*t")

## [1] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE

# "ca.*t" : as above, but the match must start with "ca" ("cart" matches).
str_detect(string = x, pattern = "ca.*t")

## [1] FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE

# Same pattern as the second example, repeated for comparison with "ca.*t".
str_detect(string = x, pattern = "ca*t")

## [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE

Rule 8: Escaping to literally match a metacharacter in a string add a leading “\”

Example

x <- c("Study.csv", "Work.csv", "123..csv")

# Unescaped "." is a wildcard: any one character followed by "csv".
str_detect(string = x, pattern = "S*.csv")

## [1] TRUE TRUE TRUE

# "\\." matches a literal dot; the "." before it is still a wildcard.
str_detect(string = x, pattern = "S*.\\.csv")

## [1] TRUE TRUE TRUE

# "$" additionally requires ".csv" to sit at the very end of the string.
str_detect(string = x, pattern = "S*.\\.csv$")

## [1] TRUE TRUE TRUE

Rule 9: Shorthands \\d digits (0-9) \\s whitespace (spaces, tabs) \\w word characters [A-Za-z0-9_] \\b word boundaries

Examples

x <- c("cat", "a cat", "cat!", "scatter")


# For `\\w`: at least one word character anywhere in the string.
str_detect(string = x, pattern = "\\w")

## [1] TRUE TRUE TRUE TRUE

# A run of one or more word characters anywhere in the string.
str_detect(string = x, pattern = "\\w+")

## [1] TRUE TRUE TRUE TRUE

# The string must end in word characters; the "!" in "cat!" breaks that.
str_detect(string = x, pattern = "\\w+$")

## [1]  TRUE  TRUE FALSE  TRUE

# The whole string must be word characters, so the space in "a cat" fails too.
str_detect(string = x, pattern = "^\\w+$")

## [1]  TRUE FALSE FALSE  TRUE

# For `\\b` word boundaries: "cat" must stand alone as a word, which rules
# out the "cat" inside "scatter".
str_detect(string = x, pattern = "\\bcat\\b")

## [1]  TRUE  TRUE  TRUE FALSE

str_replace() str_replace_all()

Exercise

# Read the messy penguin data from the working directory.
penguins_messy <- read.csv(file = "penguins_messy.csv")


# Q1
# TRUE for each row whose species_messy value contains "adelie".
str_detect(string = penguins_messy$species_messy, pattern = "adelie")

##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [109]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# (The str_detect() call above checks whether each species name contains
# "adelie".)
# Remove every literal "?" from sex_messy; "\\?" escapes the metacharacter.
str_remove_all(penguins_messy$sex_messy, "\\?")

##   [1] "M" "f" "f" NA  "f" "M" "f" "M" NA  NA  NA  NA  "f" "M" "M" "f" "f" "M"
##  [19] "f" "M" "f" "M" "f" "M" "M" "f" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M"
##  [37] "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "M" NA  "f" "M" "f" "M" "f" "M"
##  [55] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
##  [73] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "M" "f"
##  [91] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [109] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [127] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [145] "f" "M" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "f" "M" "f" "M"
## [163] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "f" "M" "f" "M" NA  "M"
## [181] "f" "M" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f"
## [199] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [217] "f" "M" NA  "M" "f" "M" "f" "M" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M"
## [235] "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "f" "M"
## [253] "f" "M" "f" "M" NA  "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" NA  "M"
## [271] "f" NA  "f" "M" "f" "M" "f" "M" "M" "f" "M" "f" "f" "M" "f" "M" "f" "M"
## [289] "f" "M" "f" "M" "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M" "f" "M"
## [307] "f" "M" "f" "M" "M" "f" "f" "M" "f" "M" "M" "f" "M" "f" "f" "M" "f" "M"
## [325] "M" "f" "f" "M" "f" "M" "f" "M" "f" "M" "M" "f" "M" "f" "f" "M" "f" "M"
## [343] "M" "f"

# The string must start with exactly two uppercase letters followed by at
# least one digit.
x <- c("AA12", "AA001")
str_detect(string = x, pattern = "^[A-Z]{2}\\d+")

## [1] TRUE TRUE

Homework

Unit Project 02 - L07: Regular Expressions

# Reload the messy penguin data so the homework starts from the raw file.
penguins_messy <- read.csv(file = "penguins_messy.csv")

Q1

First command: in the species_messy column of penguins_messy, check whether each name contains “adelie”; output TRUE if it does and FALSE otherwise. Second command: in the sex_messy column, find every “?” and replace it with a single space. Third command: in the island_messy column, remove leading and trailing whitespace and collapse any repeated whitespace into one space.

# TRUE for each row whose species_messy value contains "adelie".
str_detect(string = penguins_messy$species_messy, pattern = "adelie")

##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [109]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

#in the penguins_messy data, species_messy column, trying to find if any name is "adelie", if yes, output TRUE, other output FALSE

# Swap every literal "?" in sex_messy for a single space ("f?" becomes "f ").
str_replace_all(
  string = penguins_messy$sex_messy,
  pattern = "\\?",
  replacement = " "
)

##   [1] "M"  "f " "f " NA   "f " "M"  "f " "M"  NA   NA   NA   NA   "f " "M"  "M" 
##  [16] "f " "f " "M"  "f " "M"  "f " "M"  "f " "M"  "M"  "f " "M"  "f " "f " "M" 
##  [31] "f " "M"  "f " "M"  "f " "M"  "M"  "f " "f " "M"  "f " "M"  "f " "M"  "f "
##  [46] "M"  "M"  NA   "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M" 
##  [61] "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f "
##  [76] "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "M"  "f " "M"  "f "
##  [91] "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f "
## [106] "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M" 
## [121] "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f "
## [136] "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "M"  "f " "f " "M" 
## [151] "f " "M"  "f " "M"  "f " "M"  "M"  "f " "f " "M"  "f " "M"  "f " "M"  "f "
## [166] "M"  "f " "M"  "f " "M"  "f " "M"  "M"  "f " "f " "M"  "f " "M"  NA   "M" 
## [181] "f " "M"  "M"  "f " "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f "
## [196] "M"  "M"  "f " "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M" 
## [211] "f " "M"  "f " "M"  "f " "M"  "f " "M"  NA   "M"  "f " "M"  "f " "M"  "M" 
## [226] "f " "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M" 
## [241] "f " "M"  "f " "M"  "f " "M"  "f " "M"  "M"  "f " "f " "M"  "f " "M"  "f "
## [256] "M"  NA   "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  NA   "M" 
## [271] "f " NA   "f " "M"  "f " "M"  "f " "M"  "M"  "f " "M"  "f " "f " "M"  "f "
## [286] "M"  "f " "M"  "f " "M"  "f " "M"  "M"  "f " "f " "M"  "f " "M"  "f " "M" 
## [301] "f " "M"  "f " "M"  "f " "M"  "f " "M"  "f " "M"  "M"  "f " "f " "M"  "f "
## [316] "M"  "M"  "f " "M"  "f " "f " "M"  "f " "M"  "M"  "f " "f " "M"  "f " "M" 
## [331] "f " "M"  "f " "M"  "M"  "f " "M"  "f " "f " "M"  "f " "M"  "M"  "f "

# In the sex_messy column of penguins_messy, find every "?" and replace it with a single space (so "f?" becomes "f ").

# Trim whitespace at both ends of island_messy and collapse inner runs of
# whitespace to a single space.
str_squish(string = penguins_messy$island_messy)

##   [1] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
##   [7] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
##  [13] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
##  [19] "Torgersen" "Torgersen" "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
##  [25] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
##  [31] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
##  [37] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
##  [43] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
##  [49] "Dream"     "Dream"     "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
##  [55] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
##  [61] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
##  [67] "BIS-COE"   "BIS-COE"   "Torgersen" "Torgersen" "Torgersen" "Torgersen"
##  [73] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
##  [79] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
##  [85] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
##  [91] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
##  [97] "Dream"     "Dream"     "Dream"     "Dream"     "BIS-COE"   "BIS-COE"  
## [103] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [109] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [115] "BIS-COE"   "BIS-COE"   "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [121] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [127] "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen" "Torgersen"
## [133] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [139] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [145] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [151] "Dream"     "Dream"     "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [157] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [163] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [169] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [175] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [181] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [187] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [193] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [199] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [205] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [211] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [217] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [223] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [229] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [235] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [241] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [247] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [253] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [259] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [265] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [271] "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"   "BIS-COE"  
## [277] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [283] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [289] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [295] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [301] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [307] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [313] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [319] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [325] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [331] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [337] "Dream"     "Dream"     "Dream"     "Dream"     "Dream"     "Dream"    
## [343] "Dream"     "Dream"

# In the island_messy column of penguins_messy, remove leading and trailing whitespace and collapse repeated whitespace into one space.

Q2

# Flag rows whose sample_id contains "PAL-" followed by three digits.
# The column is referenced by bare name inside mutate() (data masking)
# instead of penguins_messy$sample_id, so the check always uses the rows
# mutate() is operating on and keeps working if the pipeline is later
# grouped or filtered upstream.
# NOTE(review): the pattern is unanchored, so an ID such as "PAL-1234" would
# also be flagged valid - confirm whether "^PAL-[0-9]{3}$" is intended.
penguins_messy <- penguins_messy %>%
  mutate(valid_id = str_detect(sample_id, "PAL-[0-9]{3}"))

# Count the rows with a valid ID (TRUE counts as 1).
penguins_messy %>%
  pull(valid_id) %>%
  sum()

## [1] 316

Q3

# Keep only the rows that were flagged as having a valid sample ID.
penguins_messy <- filter(penguins_messy, valid_id)


# Count the valid IDs again; every remaining row is TRUE.
sum(penguins_messy$valid_id)

## [1] 316

Q4

# file_date: the YYYY-MM-DD prefix at the start of file_name.
# file_id:   the "PAL-" code with three digits taken from sample_id.
penguins_messy <- mutate(
  penguins_messy,
  file_date = str_extract(file_name, "^[0-9]{4}-[0-9]{2}-[0-9]{2}"),
  file_id = str_extract(sample_id, "PAL-[0-9]{3}")
)

head(penguins_messy)

##   species_messy island_messy sex_messy sample_id
## 1        adelie  Torgersen\t         M   PAL-535
## 2        adelie  Torgersen\t        f?   PAL-442
## 3        adelie  Torgersen\t        f?   PAL-668
## 4        adelie  Torgersen\t      <NA>   PAL-920
## 5        adelie  Torgersen\t        f?   PAL-223
## 6        adelie  Torgersen\t         M   PAL-899
##                                   file_name
## 1 2024-01-14_Torgersen\t_adelie_PAL-535.csv
## 2 2024-01-17_Torgersen\t_adelie_PAL-442.csv
## 3 2024-01-08_Torgersen\t_adelie_PAL-668.csv
## 4 2024-01-26_Torgersen\t_adelie_PAL-920.csv
## 5 2024-01-06_Torgersen\t_adelie_PAL-223.csv
## 6 2024-01-08_Torgersen\t_adelie_PAL-899.csv
##                                                    notes valid_id  file_date
## 1 id=PAL-535; island=Torgersen\t; sp=adelie; bill=39.1mm     TRUE 2024-01-14
## 2 id=PAL-442; island=Torgersen\t; sp=adelie; bill=39.5mm     TRUE 2024-01-17
## 3 id=PAL-668; island=Torgersen\t; sp=adelie; bill=40.3mm     TRUE 2024-01-08
## 4   id=PAL-920; island=Torgersen\t; sp=adelie; bill=NAmm     TRUE 2024-01-26
## 5 id=PAL-223; island=Torgersen\t; sp=adelie; bill=36.7mm     TRUE 2024-01-06
## 6 id=PAL-899; island=Torgersen\t; sp=adelie; bill=39.3mm     TRUE 2024-01-08
##   file_id
## 1 PAL-535
## 2 PAL-442
## 3 PAL-668
## 4 PAL-920
## 5 PAL-223
## 6 PAL-899

Q5

# island: island_messy with surrounding whitespace trimmed and repeated
# whitespace collapsed.
penguins_messy <- mutate(penguins_messy, island = str_squish(island_messy))

# List the distinct cleaned island names.
unique(penguins_messy[["island"]])

## [1] "Torgersen" "BIS-COE"   "Dream"

Q6

# sex: sex_messy with "?" removed and the letters m / f upper-cased. The
# named vector is applied in order: each name is a pattern, each value its
# replacement.
penguins_messy <- mutate(
  penguins_messy,
  sex = str_replace_all(sex_messy, c("\\?" = "", "m" = "M", "f" = "F"))
)

unique(penguins_messy[["sex"]])

## [1] "M" "F" NA

Q7

# Pull the bill measurement out of the free-text notes column as a number.
penguins_messy <- penguins_messy %>% 
  mutate(
    # The regex deletes everything up to and including "bill=" and every
    # "mm", leaving only the number text (e.g. "39.1").
    # Notes that read "bill=NAmm" leave the text "NA", which as.numeric()
    # turns into NA with an "NAs introduced by coercion" warning.
    bill_mm = notes %>% 
      str_replace_all("(.*bill=)|mm", "") %>%
      as.numeric()
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `bill_mm = notes %>% str_replace_all("(.*bill=)|mm", "") %>%
##   as.numeric()`.
## Caused by warning in `notes %>% str_replace_all("(.*bill=)|mm", "") %>% as.numeric()`:
## ! NAs introduced by coercion

Q8

It is amazing how quickly regular expressions can extract exactly the things we want. Yes, I think I have a good understanding of how to use them.

Lec 08

Terms: drop_na() drops rows where any of the specified columns contains a missing value; distinct() keeps only one copy of rows that are exact duplicates in the data frame; where(is.type) selects columns by type or property, making code robust to new columns.

select() + starts_with("string") select() + ends_with("string") select() + contains("string") select() + matches("regex")

across() applies a function to each selected column, e.g. summarize(across(where(is.numeric), mean))

left_join(x, y, by = "column name")

~ if the condition on the left is TRUE, return the value on the right

# Conditions are checked top to bottom and the first TRUE one wins.
# Values strictly between 80 and 90 satisfy none of the three tests and come
# back as NA; the final branch spells that fall-through out explicitly.
x <- c(50, 70, 90)
case_when(
  x <= 50 ~ "F",
  x <= 80 ~ "B",
  x >= 90 ~ "A",
  TRUE ~ NA_character_
)

## [1] "F" "B" "A"

# if_else(condition, true, false): b is TRUE where a > 2.
mutate(
  data.frame(a = c(1, 2, 3)),
  b = if_else(a > 2, true = TRUE, false = FALSE)
)

##   a     b
## 1 1 FALSE
## 2 2 FALSE
## 3 3  TRUE

# Swapping the two outcomes inverts the result.
mutate(
  data.frame(a = c(1, 2, 3)),
  b = if_else(a > 2, true = FALSE, false = TRUE)
)

##   a     b
## 1 1  TRUE
## 2 2  TRUE
## 3 3 FALSE

# The outcomes can be any values of one common type, here numbers.
mutate(
  data.frame(a = c(1, 2, 3)),
  b = if_else(a > 2, true = 1, false = 2)
)

##   a b
## 1 1 2
## 2 2 2
## 3 3 1

# select() with no column arguments keeps all rows but returns zero columns.
select(penguins)

## # A tibble: 333 × 0

# Mean of every numeric column per species, computed on rows that have no
# missing values in any column.
penguins %>%
  drop_na() %>%
  group_by(species) %>%
  summarise(
    across(where(is.numeric), mean)
  )

## # A tibble: 3 × 6
##   species   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g  year
##   <fct>              <dbl>         <dbl>             <dbl>       <dbl> <dbl>
## 1 Adelie              38.8          18.3              190.       3706. 2008.
## 2 Chinstrap           48.8          18.4              196.       3733. 2008.
## 3 Gentoo              47.6          15.0              217.       5092. 2008.

# Label each penguin "heavy" (4500 g or more) or "light", showing only the
# mass and its label.
penguins %>%
  select(body_mass_g) %>%
  mutate(heavy = if_else(body_mass_g >= 4500, "heavy", "light"))

## # A tibble: 333 × 2
##    body_mass_g heavy
##          <int> <chr>
##  1        3750 light
##  2        3800 light
##  3        3250 light
##  4        3450 light
##  5        3650 light
##  6        3625 light
##  7        4675 heavy
##  8        3200 light
##  9        3800 light
## 10        4400 light
## # ℹ 323 more rows

Homework

Q1

There are originally 344 rows; after cleaning there are 333 rows.

library("palmerpenguins")

# `penguins` was reassigned near the top of this file to an NA-filtered copy,
# so nrow(penguins) no longer reports the original size. Reading the dataset
# through the package namespace gives the real "before" row count.
penguins_clean <- palmerpenguins::penguins %>%
  drop_na(bill_length_mm, sex)
nrow(palmerpenguins::penguins)

## [1] 344

nrow(penguins_clean)

## [1] 333

Q2

because repeated data are able to influence means and distribution

# Append 30 rows sampled with replacement from the cleaned data, producing a
# copy that contains duplicated rows.
penguins_dup <- bind_rows(
  penguins_clean,
  slice_sample(penguins_clean, n = 30, replace = TRUE)
)

nrow(penguins_dup)

## [1] 363

Q3

The resulting data frame has eight columns.

# Keep the three named columns plus every numeric column.
penguins_selected <- select(
  penguins_clean,
  species, island, sex, where(is.numeric)
)

ncol(penguins_selected)

## [1] 8

Q4

# Columns whose names begin with "bill".
select(penguins_clean, starts_with("bill"))

## # A tibble: 333 × 2
##    bill_length_mm bill_depth_mm
##             <dbl>         <dbl>
##  1           39.1          18.7
##  2           39.5          17.4
##  3           40.3          18  
##  4           36.7          19.3
##  5           39.3          20.6
##  6           38.9          17.8
##  7           39.2          19.6
##  8           41.1          17.6
##  9           38.6          21.2
## 10           34.6          21.1
## # ℹ 323 more rows

# Columns whose names end in "_mm".
select(penguins_clean, ends_with("_mm"))

## # A tibble: 333 × 3
##    bill_length_mm bill_depth_mm flipper_length_mm
##             <dbl>         <dbl>             <int>
##  1           39.1          18.7               181
##  2           39.5          17.4               186
##  3           40.3          18                 195
##  4           36.7          19.3               193
##  5           39.3          20.6               190
##  6           38.9          17.8               181
##  7           39.2          19.6               195
##  8           41.1          17.6               182
##  9           38.6          21.2               191
## 10           34.6          21.1               198
## # ℹ 323 more rows

# Columns whose names start with either "bill" or "flipper" (regex match).
select(penguins_clean, matches("^(bill|flipper)"))

## # A tibble: 333 × 3
##    bill_length_mm bill_depth_mm flipper_length_mm
##             <dbl>         <dbl>             <int>
##  1           39.1          18.7               181
##  2           39.5          17.4               186
##  3           40.3          18                 195
##  4           36.7          19.3               193
##  5           39.3          20.6               190
##  6           38.9          17.8               181
##  7           39.2          19.6               195
##  8           41.1          17.6               182
##  9           38.6          21.2               191
## 10           34.6          21.1               198
## # ℹ 323 more rows

Q5

more concise, and less likely to type wrong.

# Mean of every numeric column for each species.
penguins_clean %>%
  group_by(species) %>%
  summarise(
    across(where(is.numeric), mean)
  )

## # A tibble: 3 × 6
##   species   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g  year
##   <fct>              <dbl>         <dbl>             <dbl>       <dbl> <dbl>
## 1 Adelie              38.8          18.3              190.       3706. 2008.
## 2 Chinstrap           48.8          18.4              196.       3733. 2008.
## 3 Gentoo              47.6          15.0              217.       5092. 2008.

Q6

# Bucket bill length: below 40 mm is "short", 40 up to (not including) 50 mm
# is "medium", and 50 mm or more is "long". Conditions are tested in order,
# so the second one only sees values of 40 or more.
penguins_clean <- mutate(
  penguins_clean,
  bill_length = case_when(
    bill_length_mm < 40 ~ "short",
    bill_length_mm < 50 ~ "medium",
    bill_length_mm >= 50 ~ "long"
  )
)
head(penguins_clean[["bill_length"]])

## [1] "short"  "short"  "medium" "short"  "short"  "short"

Q7

penguins_instrumented has 12 columns, and penguins_clean has 9.

# Lookup table: one row per measuring instrument.
instrument_meta <- tribble(
  ~instrument_id, ~calibration_mm, ~manufacturer,
  "caliper_A",    0.2,             "Mitutoyo",
  "caliper_B",    0.5,             "Fowler",
  "caliper_C",    1.0,             "Generic"
)

# Assign an instrument to each penguin from its study year, then attach the
# instrument details. left_join() keeps every penguin row and adds the
# lookup table's remaining columns.
penguins_instrumented <- penguins_clean %>%
  drop_na(bill_length_mm) %>%
  mutate(
    instrument_id = case_when(
      year == 2007 ~ "caliper_A",
      year == 2008 ~ "caliper_B",
      year == 2009 ~ "caliper_C"
    )
  ) %>%
  left_join(instrument_meta, by = "instrument_id")

# Column count before the join ...
ncol(penguins_clean)

## [1] 9

# ... and after it.
ncol(penguins_instrumented)

## [1] 12

Q8

They are helpful, especially the left join. I have faced a similar question before in data analysis.

Unit2Project

2026-02-03

Q1

Q2

Q3

Q4

Q5

Q6

Q7

Lec 7

Unit Project 02 - L07: Regular Expressions

Regular Expressions (Regex)

Exercise

Homework

Unit Project 02 - L07: Regular Expressions

Q1

Q2

Q3

Q4

Q5

Q6

Q7

Q8

Lec 08

Homework

Q1

Q2

Q3

Q4

Q5

Q6

Q7

Q8