Strings: Pre-class Video (Part 1)

You can download this .qmd file from here. Just hit the Download Raw File button.

This uses parts of R4DS Ch 14: Strings and Ch 15: Regular Expressions (both the first and second editions).

library(tidyverse)
# Read the Spotify data from the URL below. If you have a local copy saved
# in a Data/ folder, you can use this line instead:
# spotify <- read_csv("Data/spotify.csv")
spotify <- read_csv("https://proback.github.io/264_fall_2024/Data/spotify.csv")

# Keep only the six character columns used in this lesson ...
spot_smaller <- select(
  spotify,
  title, artist, album_release_date,
  album_name, subgenre, playlist_name
)

# ... and only ten hand-picked rows, so the examples stay small.
spot_smaller <- spot_smaller |>
  slice(c(5, 32, 49, 52, 83, 175, 219, 231, 246, 265))
spot_smaller

# A tibble: 10 × 6
   title             artist album_release_date album_name subgenre playlist_name
   <chr>             <chr>  <chr>              <chr>      <chr>    <chr>        
 1 Hear Me Now       Alok   2016-01-01         Hear Me N… indie p… "Chillout & …
 2 Run the World (G… Beyon… 2011-06-24         4          post-te… "post-teen a…
 3 Formation         Beyon… 2016-04-23         Lemonade   hip pop  "Feeling Acc…
 4 7/11              Beyon… 2014-11-24         BEYONCÉ [… hip pop  "Feeling Acc…
 5 My Oh My (feat. … Camil… 2019-12-06         Romance    latin p… "2020 Hits &…
 6 It's Automatic    Frees… 2013-11-28         It's Auto… latin h… "80's Freest…
 7 Poetic Justice    Kendr… 2012               good kid,… hip hop  "Hip Hop Con…
 8 A.D.H.D           Kendr… 2011-07-02         Section.80 souther… "Hip-Hop 'n …
 9 Ya Estuvo         Kid F… 1990-01-01         Hispanic … latin h… "HIP-HOP: La…
10 Runnin (with A$A… Mike … 2018-11-16         Creed II:… gangste… "RAP Gangsta"

A string is just a set of characters.

single_string <- "this is a string!"
single_string

[1] "this is a string!"

string_vector <- c("this", "is", "a", "vector", "of strings")
string_vector

[1] "this"       "is"         "a"          "vector"     "of strings"

# This is a tibble with many columns of "string variables", or "character variables"
spot_smaller

# A tibble: 10 × 6
   title             artist album_release_date album_name subgenre playlist_name
   <chr>             <chr>  <chr>              <chr>      <chr>    <chr>        
 1 Hear Me Now       Alok   2016-01-01         Hear Me N… indie p… "Chillout & …
 2 Run the World (G… Beyon… 2011-06-24         4          post-te… "post-teen a…
 3 Formation         Beyon… 2016-04-23         Lemonade   hip pop  "Feeling Acc…
 4 7/11              Beyon… 2014-11-24         BEYONCÉ [… hip pop  "Feeling Acc…
 5 My Oh My (feat. … Camil… 2019-12-06         Romance    latin p… "2020 Hits &…
 6 It's Automatic    Frees… 2013-11-28         It's Auto… latin h… "80's Freest…
 7 Poetic Justice    Kendr… 2012               good kid,… hip hop  "Hip Hop Con…
 8 A.D.H.D           Kendr… 2011-07-02         Section.80 souther… "Hip-Hop 'n …
 9 Ya Estuvo         Kid F… 1990-01-01         Hispanic … latin h… "HIP-HOP: La…
10 Runnin (with A$A… Mike … 2018-11-16         Creed II:… gangste… "RAP Gangsta"

# Each column of the tibble is a vector of strings.
spot_smaller$title

 [1] "Hear Me Now"                                      
 [2] "Run the World (Girls)"                            
 [3] "Formation"                                        
 [4] "7/11"                                             
 [5] "My Oh My (feat. DaBaby)"                          
 [6] "It's Automatic"                                   
 [7] "Poetic Justice"                                   
 [8] "A.D.H.D"                                          
 [9] "Ya Estuvo"                                        
[10] "Runnin (with A$AP Rocky, A$AP Ferg & Nicki Minaj)"

# Each item in the tibble is a string.
spot_smaller$title[1]

[1] "Hear Me Now"

Functions that start with str_ do stuff to strings!

str_length()

# when the input to str_length is a single string, the output is a single value:
str_length("hi")

[1] 2

str_length(single_string)

[1] 17

# when the input to str_length is a vector, the output is a vector:
str_length(string_vector)

[1]  4  2  1  6 10

str_length takes a vector input and returns a vector output (or takes a single value input and returns a single value output)… this makes it easy to use within a mutate!

# Add each title's character count next to the title itself.
# .keep = "used" retains only the column(s) used in the calculation
# (title) plus the newly created column.
spot_smaller |>
  mutate(title_length = str_length(title), .keep = "used")

# A tibble: 10 × 2
   title                                             title_length
   <chr>                                                    <int>
 1 Hear Me Now                                                 11
 2 Run the World (Girls)                                       21
 3 Formation                                                    9
 4 7/11                                                         4
 5 My Oh My (feat. DaBaby)                                     23
 6 It's Automatic                                              14
 7 Poetic Justice                                              14
 8 A.D.H.D                                                      7
 9 Ya Estuvo                                                    9
10 Runnin (with A$AP Rocky, A$AP Ferg & Nicki Minaj)           49

str_sub()

This function creates substrings (shorter strings)

# When the input is a single string, the output is a single string
single_string

[1] "this is a string!"

str_sub(single_string, 1, 7)

[1] "this is"

str_sub(single_string, 8, 9)

[1] " a"

str_sub(single_string, 9, 9)

[1] "a"

# When the input is a vector of strings, what do you think the output will be?
string_vector

[1] "this"       "is"         "a"          "vector"     "of strings"

str_sub(string_vector, 1, 2)

[1] "th" "is" "a"  "ve" "of"

How can we use str_sub to get just the year of the album_release_date? Try it here! Then scroll down for solution.

spot_smaller

# A tibble: 10 × 6
   title             artist album_release_date album_name subgenre playlist_name
   <chr>             <chr>  <chr>              <chr>      <chr>    <chr>        
 1 Hear Me Now       Alok   2016-01-01         Hear Me N… indie p… "Chillout & …
 2 Run the World (G… Beyon… 2011-06-24         4          post-te… "post-teen a…
 3 Formation         Beyon… 2016-04-23         Lemonade   hip pop  "Feeling Acc…
 4 7/11              Beyon… 2014-11-24         BEYONCÉ [… hip pop  "Feeling Acc…
 5 My Oh My (feat. … Camil… 2019-12-06         Romance    latin p… "2020 Hits &…
 6 It's Automatic    Frees… 2013-11-28         It's Auto… latin h… "80's Freest…
 7 Poetic Justice    Kendr… 2012               good kid,… hip hop  "Hip Hop Con…
 8 A.D.H.D           Kendr… 2011-07-02         Section.80 souther… "Hip-Hop 'n …
 9 Ya Estuvo         Kid F… 1990-01-01         Hispanic … latin h… "HIP-HOP: La…
10 Runnin (with A$A… Mike … 2018-11-16         Creed II:… gangste… "RAP Gangsta"

. . . . . . . .

# In every row shown above, the year is the first four characters of
# album_release_date, so take characters 1 through 4.
spot_smaller |>
  mutate(
    album_release_year = str_sub(album_release_date, start = 1, end = 4)
  ) |>
  select(title, artist, album_release_date, album_release_year)

# A tibble: 10 × 4
   title                            artist album_release_date album_release_year
   <chr>                            <chr>  <chr>              <chr>             
 1 Hear Me Now                      Alok   2016-01-01         2016              
 2 Run the World (Girls)            Beyon… 2011-06-24         2011              
 3 Formation                        Beyon… 2016-04-23         2016              
 4 7/11                             Beyon… 2014-11-24         2014              
 5 My Oh My (feat. DaBaby)          Camil… 2019-12-06         2019              
 6 It's Automatic                   Frees… 2013-11-28         2013              
 7 Poetic Justice                   Kendr… 2012               2012              
 8 A.D.H.D                          Kendr… 2011-07-02         2011              
 9 Ya Estuvo                        Kid F… 1990-01-01         1990              
10 Runnin (with A$AP Rocky, A$AP F… Mike … 2018-11-16         2018

str_c()

This collapses multiple strings together into one string.

str_c("is", "this output", "a", "single value", "or", "a vector", "?")

[1] "isthis outputasingle valueora vector?"

# like unite and separate, we can specify the separator:

str_c("is", "this output", "a", "single value", "or", "a vector", "?", 
      sep = " ")

[1] "is this output a single value or a vector ?"

We can see that the input is several separate strings, and the output is a single string.

So… why is this useful?

# runif(1) draws one random number between 0 and 1; unless a seed has been
# set, your value will differ from the one printed below.
x <- runif(1)
x

[1] 0.2292812

str_c("I can put other values, like", x, "in here!", sep = " ")

[1] "I can put other values, like 0.229281222214922 in here!"

spot_smaller

# A tibble: 10 × 6
   title             artist album_release_date album_name subgenre playlist_name
   <chr>             <chr>  <chr>              <chr>      <chr>    <chr>        
 1 Hear Me Now       Alok   2016-01-01         Hear Me N… indie p… "Chillout & …
 2 Run the World (G… Beyon… 2011-06-24         4          post-te… "post-teen a…
 3 Formation         Beyon… 2016-04-23         Lemonade   hip pop  "Feeling Acc…
 4 7/11              Beyon… 2014-11-24         BEYONCÉ [… hip pop  "Feeling Acc…
 5 My Oh My (feat. … Camil… 2019-12-06         Romance    latin p… "2020 Hits &…
 6 It's Automatic    Frees… 2013-11-28         It's Auto… latin h… "80's Freest…
 7 Poetic Justice    Kendr… 2012               good kid,… hip hop  "Hip Hop Con…
 8 A.D.H.D           Kendr… 2011-07-02         Section.80 souther… "Hip-Hop 'n …
 9 Ya Estuvo         Kid F… 1990-01-01         Hispanic … latin h… "HIP-HOP: La…
10 Runnin (with A$A… Mike … 2018-11-16         Creed II:… gangste… "RAP Gangsta"

# Count songs per artist, then keep the artist(s) with the largest count.
# NOTE: slice_max() keeps ties by default, so song_count can have more than
# one row if several artists share the top count.
song_count <- spot_smaller |> 
  count(artist) |>
  slice_max(n, n = 1)

song_count

# A tibble: 1 × 2
  artist      n
  <chr>   <int>
1 Beyoncé     3

song_count$artist

[1] "Beyoncé"

song_count$n

[1] 3

# Build one sentence from fixed text plus values pulled out of song_count.
str_c(
  "The artist with the most songs in spot_smaller is",
  song_count$artist,
  "with",
  song_count$n,
  "songs.",
  sep = " "
)

[1] "The artist with the most songs in spot_smaller is Beyoncé with 3 songs."

We can use this in a tibble too.

# Paste title and artist together row by row; .keep = "none" drops every
# original column so only song_by remains.
spot_smaller |>
  mutate(
    song_by = str_c(title, "by", artist, sep = " "),
    .keep = "none"
  )

# A tibble: 10 × 1
   song_by                                                               
   <chr>                                                                 
 1 Hear Me Now by Alok                                                   
 2 Run the World (Girls) by Beyoncé                                      
 3 Formation by Beyoncé                                                  
 4 7/11 by Beyoncé                                                       
 5 My Oh My (feat. DaBaby) by Camila Cabello                             
 6 It's Automatic by Freestyle                                           
 7 Poetic Justice by Kendrick Lamar                                      
 8 A.D.H.D by Kendrick Lamar                                             
 9 Ya Estuvo by Kid Frost                                                
10 Runnin (with A$AP Rocky, A$AP Ferg & Nicki Minaj) by Mike WiLL Made-It

str_to_lower(), str_to_upper(), str_to_title()

These are pretty self explanatory.

spot_smaller |>
  select(title) |>
  mutate(title_to_lower = str_to_lower(title),
         title_to_upper = str_to_upper(title))

# A tibble: 10 × 3
   title                                           title_to_lower title_to_upper
   <chr>                                           <chr>          <chr>         
 1 Hear Me Now                                     hear me now    HEAR ME NOW   
 2 Run the World (Girls)                           run the world… RUN THE WORLD…
 3 Formation                                       formation      FORMATION     
 4 7/11                                            7/11           7/11          
 5 My Oh My (feat. DaBaby)                         my oh my (fea… MY OH MY (FEA…
 6 It's Automatic                                  it's automatic IT'S AUTOMATIC
 7 Poetic Justice                                  poetic justice POETIC JUSTICE
 8 A.D.H.D                                         a.d.h.d        A.D.H.D       
 9 Ya Estuvo                                       ya estuvo      YA ESTUVO     
10 Runnin (with A$AP Rocky, A$AP Ferg & Nicki Min… runnin (with … RUNNIN (WITH …

# title is already in title case, so: 
str_to_title("makes this into title case")

[1] "Makes This Into Title Case"

Matching Patterns

In addition to manipulating strings, we might want to search through them to find matches. For example, can I find all the songs that start with M? The songs from 2016? The album titles that include a number?

str_view()

This function is helpful for viewing. It shows the elements of the vector that contain the pattern you’re searching for, highlighting each match between < > symbols and in a different color.

The first input is the vector, and the second input is the string/substring/pattern you are looking for.

str_view(spot_smaller$album_release_date, "2016")

[1] │ <2016>-01-01
[3] │ <2016>-04-23

str_view(spot_smaller$title, "M")

 [1] │ Hear <M>e Now
 [5] │ <M>y Oh <M>y (feat. DaBaby)
[10] │ Runnin (with A$AP Rocky, A$AP Ferg & Nicki <M>inaj)

str_view(spot_smaller$subgenre, "pop")

[1] │ indie <pop>timism
[2] │ post-teen <pop>
[3] │ hip <pop>
[4] │ hip <pop>
[5] │ latin <pop>

str_view(spot_smaller$subgenre, "hip hop")

[6] │ latin <hip hop>
[7] │ <hip hop>
[8] │ southern <hip hop>
[9] │ latin <hip hop>

str_subset()

str_subset() takes a vector input and returns a (usually shorter) vector output. Compare the output from str_view() and str_subset() here. Both of these functions can be hard to work with in a tibble.

str_view(spot_smaller$title, "M")

 [1] │ Hear <M>e Now
 [5] │ <M>y Oh <M>y (feat. DaBaby)
[10] │ Runnin (with A$AP Rocky, A$AP Ferg & Nicki <M>inaj)

str_subset(spot_smaller$title, "M")

[1] "Hear Me Now"                                      
[2] "My Oh My (feat. DaBaby)"                          
[3] "Runnin (with A$AP Rocky, A$AP Ferg & Nicki Minaj)"

str_detect()

str_detect takes a vector of strings (or single string) input and returns a vector of TRUE/FALSE (or single value). This makes it easy to work with in tibbles, using mutate or filter.

str_view(spot_smaller$title, "M")

 [1] │ Hear <M>e Now
 [5] │ <M>y Oh <M>y (feat. DaBaby)
[10] │ Runnin (with A$AP Rocky, A$AP Ferg & Nicki <M>inaj)

str_detect(spot_smaller$title, "M")

 [1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE

str_detect("hello", "ll")

[1] TRUE

spot_smaller |> 
  select(title, album_name, artist) |>
  mutate(includes_M = str_detect(title, "M"))

# A tibble: 10 × 4
   title                                            album_name artist includes_M
   <chr>                                            <chr>      <chr>  <lgl>     
 1 Hear Me Now                                      Hear Me N… Alok   TRUE      
 2 Run the World (Girls)                            4          Beyon… FALSE     
 3 Formation                                        Lemonade   Beyon… FALSE     
 4 7/11                                             BEYONCÉ [… Beyon… FALSE     
 5 My Oh My (feat. DaBaby)                          Romance    Camil… TRUE      
 6 It's Automatic                                   It's Auto… Frees… FALSE     
 7 Poetic Justice                                   good kid,… Kendr… FALSE     
 8 A.D.H.D                                          Section.80 Kendr… FALSE     
 9 Ya Estuvo                                        Hispanic … Kid F… FALSE     
10 Runnin (with A$AP Rocky, A$AP Ferg & Nicki Mina… Creed II:… Mike … TRUE

spot_smaller |>  
  select(title, album_name, artist) |>
  filter(str_detect(title, "M"))

# A tibble: 3 × 3
  title                                             album_name          artist  
  <chr>                                             <chr>               <chr>   
1 Hear Me Now                                       Hear Me Now         Alok    
2 My Oh My (feat. DaBaby)                           Romance             Camila …
3 Runnin (with A$AP Rocky, A$AP Ferg & Nicki Minaj) Creed II: The Album Mike Wi…

# Songs whose subgenre contains "pop": keep the matching rows first,
# then pick the columns to display.
spot_smaller |>
  filter(str_detect(subgenre, "pop")) |>
  select(title, album_name, artist, subgenre)

# A tibble: 5 × 4
  title                   album_name                 artist         subgenre    
  <chr>                   <chr>                      <chr>          <chr>       
1 Hear Me Now             Hear Me Now                Alok           indie popti…
2 Run the World (Girls)   4                          Beyoncé        post-teen p…
3 Formation               Lemonade                   Beyoncé        hip pop     
4 7/11                    BEYONCÉ [Platinum Edition] Beyoncé        hip pop     
5 My Oh My (feat. DaBaby) Romance                    Camila Cabello latin pop

str_extract()

str_extract() takes a vector of strings (or a single string) as input and returns a vector of strings (or a single string) as output.

single_string

[1] "this is a string!"

str_extract(single_string, "this")

[1] "this"

str_extract() is more interesting when we want to identify a particular pattern to extract from the string.

For instance:

str_extract("find first vowel", "[aeiou]")

[1] "i"

str_extract("any numb3rs?", "\\d")

[1] "3"

numbers_here <- c("numb3rs", "ar3", "h1d1ing", "almost", "ev3ryw4ere")

str_extract(numbers_here, "\\d")

[1] "3" "3" "1" NA  "3"

str_view(numbers_here, "\\d")

[1] │ numb<3>rs
[2] │ ar<3>
[3] │ h<1>d<1>ing
[5] │ ev<3>ryw<4>ere

Because str_extract returns a vector of the same length as its input, it also can be used within a tibble.

spot_smaller |>
  select(title, artist, album_name) |>
  mutate(numbers = str_extract(album_name, "\\d"))

# A tibble: 10 × 4
   title                                             artist   album_name numbers
   <chr>                                             <chr>    <chr>      <chr>  
 1 Hear Me Now                                       Alok     Hear Me N… <NA>   
 2 Run the World (Girls)                             Beyoncé  4          4      
 3 Formation                                         Beyoncé  Lemonade   <NA>   
 4 7/11                                              Beyoncé  BEYONCÉ [… <NA>   
 5 My Oh My (feat. DaBaby)                           Camila … Romance    <NA>   
 6 It's Automatic                                    Freesty… It's Auto… <NA>   
 7 Poetic Justice                                    Kendric… good kid,… <NA>   
 8 A.D.H.D                                           Kendric… Section.80 8      
 9 Ya Estuvo                                         Kid Fro… Hispanic … <NA>   
10 Runnin (with A$AP Rocky, A$AP Ferg & Nicki Minaj) Mike Wi… Creed II:… <NA>

The patterns we show here, “\\d” and “[aeiou]”, are called regular expressions.

Regular Expressions

Regular expressions are a way to write general patterns… for instance the string “\\d” will find any digit (number). We can also specify whether we want the string to start or end with a certain letter.

Notice the difference between the regular expression “M” and “^M”, “o” and “o$”

str_view(spot_smaller$title, "M")

 [1] │ Hear <M>e Now
 [5] │ <M>y Oh <M>y (feat. DaBaby)
[10] │ Runnin (with A$AP Rocky, A$AP Ferg & Nicki <M>inaj)

str_view(spot_smaller$title, "^M")

[5] │ <M>y Oh My (feat. DaBaby)

str_view(spot_smaller$title, "o")

 [1] │ Hear Me N<o>w
 [2] │ Run the W<o>rld (Girls)
 [3] │ F<o>rmati<o>n
 [6] │ It's Aut<o>matic
 [7] │ P<o>etic Justice
 [9] │ Ya Estuv<o>
[10] │ Runnin (with A$AP R<o>cky, A$AP Ferg & Nicki Minaj)

str_view(spot_smaller$title, "o$")

[9] │ Ya Estuv<o>

But how do I look for a dollar sign in my string? I use \ to “escape” the special behavior of $. But \ itself has special behavior… so I need two of them.

str_view(spot_smaller$title, "\\$")

[10] │ Runnin (with A<$>AP Rocky, A<$>AP Ferg & Nicki Minaj)

Example problem

Are there any album names that contain numbers?

step 1: use str_view() to figure out an appropriate regular expression to use for searching.

str_view(spot_smaller$album_name, "\\d")

[2] │ <4>
[8] │ Section.<8><0>

step 2: what kind of output do I want?

# A list of the album names?
str_subset(spot_smaller$album_name, "\\d")

[1] "4"          "Section.80"

# A tibble? 
spot_smaller |>
  filter(str_detect(album_name, "\\d"))

# A tibble: 2 × 6
  title              artist album_release_date album_name subgenre playlist_name
  <chr>              <chr>  <chr>              <chr>      <chr>    <chr>        
1 Run the World (Gi… Beyon… 2011-06-24         4          post-te… post-teen al…
2 A.D.H.D            Kendr… 2011-07-02         Section.80 souther… Hip-Hop 'n R…

More regular expressions

[abc] - a, b, or c

str_view(spot_smaller$subgenre, "[hp]op")

[1] │ indie <pop>timism
[2] │ post-teen <pop>
[3] │ hip <pop>
[4] │ hip <pop>
[5] │ latin <pop>
[6] │ latin hip <hop>
[7] │ hip <hop>
[8] │ southern hip <hop>
[9] │ latin hip <hop>

str_view(spot_smaller$subgenre, "hip [hp]op")

[3] │ <hip pop>
[4] │ <hip pop>
[6] │ latin <hip hop>
[7] │ <hip hop>
[8] │ southern <hip hop>
[9] │ latin <hip hop>

[^abc] - anything EXCEPT a, b, or c

str_view(spot_smaller$album_name, "[^\\d]")

 [1] │ <H><e><a><r>< ><M><e>< ><N><o><w>
 [3] │ <L><e><m><o><n><a><d><e>
 [4] │ <B><E><Y><O><N><C><É>< ><[><P><l><a><t><i><n><u><m>< ><E><d><i><t><i><o><n><]>
 [5] │ <R><o><m><a><n><c><e>
 [6] │ <I><t><'><s>< ><A><u><t><o><m><a><t><i><c>
 [7] │ <g><o><o><d>< ><k><i><d><,>< ><m><.><A><.><A><.><d>< ><c><i><t><y>< ><(><D><e><l><u><x><e><)>
 [8] │ <S><e><c><t><i><o><n><.>80
 [9] │ <H><i><s><p><a><n><i><c>< ><C><a><u><s><i><n><g>< ><P><a><n><i><c>
[10] │ <C><r><e><e><d>< ><I><I><:>< ><T><h><e>< ><A><l><b><u><m>

str_view(spot_smaller$album_name, "[^a-zA-Z ]")

 [2] │ <4>
 [4] │ BEYONC<É> <[>Platinum Edition<]>
 [6] │ It<'>s Automatic
 [7] │ good kid<,> m<.>A<.>A<.>d city <(>Deluxe<)>
 [8] │ Section<.><8><0>
[10] │ Creed II<:> The Album

Bonus content not in the pre-class video

str_glue()

This is a nice alternative to str_c(), where you only need a single set of quotes, and anything inside curly brackets {} is evaluated like it’s outside the quotes.

# Thus, this code from earlier...

spot_smaller

# A tibble: 10 × 6
   title             artist album_release_date album_name subgenre playlist_name
   <chr>             <chr>  <chr>              <chr>      <chr>    <chr>        
 1 Hear Me Now       Alok   2016-01-01         Hear Me N… indie p… "Chillout & …
 2 Run the World (G… Beyon… 2011-06-24         4          post-te… "post-teen a…
 3 Formation         Beyon… 2016-04-23         Lemonade   hip pop  "Feeling Acc…
 4 7/11              Beyon… 2014-11-24         BEYONCÉ [… hip pop  "Feeling Acc…
 5 My Oh My (feat. … Camil… 2019-12-06         Romance    latin p… "2020 Hits &…
 6 It's Automatic    Frees… 2013-11-28         It's Auto… latin h… "80's Freest…
 7 Poetic Justice    Kendr… 2012               good kid,… hip hop  "Hip Hop Con…
 8 A.D.H.D           Kendr… 2011-07-02         Section.80 souther… "Hip-Hop 'n …
 9 Ya Estuvo         Kid F… 1990-01-01         Hispanic … latin h… "HIP-HOP: La…
10 Runnin (with A$A… Mike … 2018-11-16         Creed II:… gangste… "RAP Gangsta"

song_count <- spot_smaller |> 
  count(artist) |>
  slice_max(n, n = 1)
song_count

# A tibble: 1 × 2
  artist      n
  <chr>   <int>
1 Beyoncé     3

str_c("The artist with the most songs in spot_smaller is", song_count$artist, "with", song_count$n, "songs.", sep = " ")

[1] "The artist with the most songs in spot_smaller is Beyoncé with 3 songs."

# ... becomes this:

# Inside mutate(), {artist} and {n} refer to columns of song_count.
song_count |>
  mutate(
    statement = str_glue(
      "The artist with the most songs in spot_smaller is {artist} with {n} songs."
    )
  )

# A tibble: 1 × 3
  artist      n statement                                                       
  <chr>   <int> <glue>                                                          
1 Beyoncé     3 The artist with the most songs in spot_smaller is Beyoncé with …

# or 

str_glue("The artist with the most songs in spot_smaller is {song_count$artist} with {song_count$n} songs.")

The artist with the most songs in spot_smaller is Beyoncé with 3 songs.

str_glue() can also be applied to an entire column vector:

# One glued sentence per row; .keep = "none" leaves only the new column.
spot_smaller |>
  mutate(
    statement = str_glue(
      "{artist} released {album_name} on {album_release_date}."
    ),
    .keep = "none"
  )

# A tibble: 10 × 1
   statement                                                       
   <glue>                                                          
 1 Alok released Hear Me Now on 2016-01-01.                        
 2 Beyoncé released 4 on 2011-06-24.                               
 3 Beyoncé released Lemonade on 2016-04-23.                        
 4 Beyoncé released BEYONCÉ [Platinum Edition] on 2014-11-24.      
 5 Camila Cabello released Romance on 2019-12-06.                  
 6 Freestyle released It's Automatic on 2013-11-28.                
 7 Kendrick Lamar released good kid, m.A.A.d city (Deluxe) on 2012.
 8 Kendrick Lamar released Section.80 on 2011-07-02.               
 9 Kid Frost released Hispanic Causing Panic on 1990-01-01.        
10 Mike WiLL Made-It released Creed II: The Album on 2018-11-16.

And if you wanted to include {} in your statement, you can double up {} to serve as an escape character:

# Doubled braces {{ }} are printed as literal { } rather than evaluated.
# .keep = "none" leaves only the new statement column.
spot_smaller |>
  mutate(
    statement = str_glue(
      "{artist} released {album_name} on {album_release_date} {{according to Spotify}}."
    ),
    .keep = "none"
  )

# A tibble: 10 × 1
   statement                                                                    
   <glue>                                                                       
 1 Alok released Hear Me Now on 2016-01-01 {according to Spotify}.              
 2 Beyoncé released 4 on 2011-06-24 {according to Spotify}.                     
 3 Beyoncé released Lemonade on 2016-04-23 {according to Spotify}.              
 4 Beyoncé released BEYONCÉ [Platinum Edition] on 2014-11-24 {according to Spot…
 5 Camila Cabello released Romance on 2019-12-06 {according to Spotify}.        
 6 Freestyle released It's Automatic on 2013-11-28 {according to Spotify}.      
 7 Kendrick Lamar released good kid, m.A.A.d city (Deluxe) on 2012 {according t…
 8 Kendrick Lamar released Section.80 on 2011-07-02 {according to Spotify}.     
 9 Kid Frost released Hispanic Causing Panic on 1990-01-01 {according to Spotif…
10 Mike WiLL Made-It released Creed II: The Album on 2018-11-16 {according to S…

separate_wider_delim() and its cousins

When multiple variables are crammed together into a single string, the separate_ functions can be used to extract the pieces and produce additional rows (longer) or columns (wider). We show one such example below, using the optional “too_few” setting to diagnose issues after getting a warning message the first time.

# Split album_release_date at each "-" into year, month, and day.
# too_few = "debug" keeps rows that have fewer than three pieces and adds
# diagnostic columns (*_ok, *_pieces, *_remainder) so we can find them.
spot_smaller |>
  separate_wider_delim(
    album_release_date,
    delim = "-",
    names = c("year", "month", "day"),
    too_few = "debug"
  ) |>
  # width = Inf prints every column instead of truncating the display
  print(width = Inf)

Warning: Debug mode activated: adding variables `album_release_date_ok`,
`album_release_date_pieces`, and `album_release_date_remainder`.

# A tibble: 10 × 12
   title                                             artist            year 
   <chr>                                             <chr>             <chr>
 1 Hear Me Now                                       Alok              2016 
 2 Run the World (Girls)                             Beyoncé           2011 
 3 Formation                                         Beyoncé           2016 
 4 7/11                                              Beyoncé           2014 
 5 My Oh My (feat. DaBaby)                           Camila Cabello    2019 
 6 It's Automatic                                    Freestyle         2013 
 7 Poetic Justice                                    Kendrick Lamar    2012 
 8 A.D.H.D                                           Kendrick Lamar    2011 
 9 Ya Estuvo                                         Kid Frost         1990 
10 Runnin (with A$AP Rocky, A$AP Ferg & Nicki Minaj) Mike WiLL Made-It 2018 
   month day   album_release_date album_release_date_ok
   <chr> <chr> <chr>              <lgl>                
 1 01    01    2016-01-01         TRUE                 
 2 06    24    2011-06-24         TRUE                 
 3 04    23    2016-04-23         TRUE                 
 4 11    24    2014-11-24         TRUE                 
 5 12    06    2019-12-06         TRUE                 
 6 11    28    2013-11-28         TRUE                 
 7 <NA>  <NA>  2012               FALSE                
 8 07    02    2011-07-02         TRUE                 
 9 01    01    1990-01-01         TRUE                 
10 11    16    2018-11-16         TRUE                 
   album_release_date_pieces album_release_date_remainder
                       <int> <chr>                       
 1                         3 ""                          
 2                         3 ""                          
 3                         3 ""                          
 4                         3 ""                          
 5                         3 ""                          
 6                         3 ""                          
 7                         1 ""                          
 8                         3 ""                          
 9                         3 ""                          
10                         3 ""                          
   album_name                      subgenre        
   <chr>                           <chr>           
 1 Hear Me Now                     indie poptimism 
 2 4                               post-teen pop   
 3 Lemonade                        hip pop         
 4 BEYONCÉ [Platinum Edition]      hip pop         
 5 Romance                         latin pop       
 6 It's Automatic                  latin hip hop   
 7 good kid, m.A.A.d city (Deluxe) hip hop         
 8 Section.80                      southern hip hop
 9 Hispanic Causing Panic          latin hip hop   
10 Creed II: The Album             gangster rap    
   playlist_name                                                              
   <chr>                                                                      
 1 "Chillout & Remixes \U0001f49c"                                            
 2 "post-teen alternative, indie, pop (large variety)"                        
 3 "Feeling Accomplished"                                                     
 4 "Feeling Accomplished"                                                     
 5 "2020 Hits & 2019  Hits – Top Global Tracks \U0001f525\U0001f525\U0001f525"
 6 "80's Freestyle/Disco Dance Party (Set Crossfade to 4-Seconds)"            
 7 "Hip Hop Controller"                                                       
 8 "Hip-Hop 'n RnB"                                                           
 9 "HIP-HOP: Latin Rap ['89-present]"                                         
10 "RAP Gangsta"

# Same split, but too_few = "align_start" fills the new columns from the
# left, so a date with only a year gets NA for month and day.
spot_smaller |>
  separate_wider_delim(
    album_release_date,
    delim = "-",
    names = c("year", "month", "day"),
    too_few = "align_start"
  )

# A tibble: 10 × 8
   title              artist year  month day   album_name subgenre playlist_name
   <chr>              <chr>  <chr> <chr> <chr> <chr>      <chr>    <chr>        
 1 Hear Me Now        Alok   2016  01    01    Hear Me N… indie p… "Chillout & …
 2 Run the World (Gi… Beyon… 2011  06    24    4          post-te… "post-teen a…
 3 Formation          Beyon… 2016  04    23    Lemonade   hip pop  "Feeling Acc…
 4 7/11               Beyon… 2014  11    24    BEYONCÉ [… hip pop  "Feeling Acc…
 5 My Oh My (feat. D… Camil… 2019  12    06    Romance    latin p… "2020 Hits &…
 6 It's Automatic     Frees… 2013  11    28    It's Auto… latin h… "80's Freest…
 7 Poetic Justice     Kendr… 2012  <NA>  <NA>  good kid,… hip hop  "Hip Hop Con…
 8 A.D.H.D            Kendr… 2011  07    02    Section.80 souther… "Hip-Hop 'n …
 9 Ya Estuvo          Kid F… 1990  01    01    Hispanic … latin h… "HIP-HOP: La…
10 Runnin (with A$AP… Mike … 2018  11    16    Creed II:… gangste… "RAP Gangsta"

If there is a definable pattern, but the pattern is a bit weird, we can often use separate_wider_regex() to extract the correct values and build a tidy data set:

# A one-column tibble where each string packs a name, a one-letter gender
# code, and an age into the shape "<name>-gender_age".
df <- tibble(
  str = c(
    "<Sheryl>-F_34",
    "<Kisha>-F_45",
    "<Brandon>-N_33",
    "<Sharon>-F_38",
    "<Penny>-F_58",
    "<Justin>-M_41",
    "<Patricia>-F_84"
  )
)

# Describe each string piece by piece with regular expressions. Named
# pieces become columns in the result; unnamed pieces are matched but
# do not appear in the output.
df |> 
  separate_wider_regex(
    str,
    patterns = c(
      "<",                 # literal opening bracket (no column)
      name = "[A-Za-z]+",  # one or more letters
      ">-",                # literal ">-" between name and gender (no column)
      gender = ".",        # any single character
      "_",                 # literal underscore (no column)
      age = "[0-9]+"       # one or more digits (stays a character column)
    )
  )

# A tibble: 7 × 3
  name     gender age  
  <chr>    <chr>  <chr>
1 Sheryl   F      34   
2 Kisha    F      45   
3 Brandon  N      33   
4 Sharon   F      38   
5 Penny    F      58   
6 Justin   M      41   
7 Patricia F      84