stringr-grammar.Rmd

title: "Textdaten bearbeiten mit **stringr**"
author: "B. Philipp Kleer"
date: "11. Oktober 2021"
output:
  html_document:
      toc: true
      toc_float:
        toc_collapsed: true
        smooth_scroll: true
      toc_depth: 3
      widescreen: true
      highlight: pygments
      theme: readable
      css: styles/style.css
      df_print: paged
      mathjax: default
      self_contained: false
      incremental: false # true: reveal each bullet point individually
      collapse: true # means the text output will be merged into the R source code block
      includes:
        after_body: ./styles/footer.html
        before_body: ./styles/header.html
library("knitr")
library("rmarkdown")
library("tidyverse")

# Load the tweet data set that the examples in this document work on.
gbbo <- readRDS("../datasets/gbbo.rds")

# Console width is a global R option, not a knitr chunk option. It is set in
# its own statement so that the return value of options() is no longer stored
# as a stray, unnamed entry among the chunk defaults.
options(width = 80)

# Default knitr chunk options applied to every chunk of the document.
opts_chunk$set(
  # fig.path = "pics/s6-",  # path for calculated figures (disabled)
  fig.align = "center",     # figure alignment (also: right, left, default)
  fig.show = "hold",        # hold: show figures at the end of the chunk
  fig.width = 6,            # figure width
  fig.height = 6,           # figure height
  echo = TRUE,              # code is printed
  eval = FALSE,             # code is NOT evaluated
  warning = FALSE,          # warnings are NOT displayed
  message = FALSE,          # messages are NOT displayed
  size = "tiny",            # LaTeX size of code chunks
  background = "#E7E7E7",   # background colour of code chunks
  comment = "",             # no hash prefix before output lines
  results = "markdown",
  rows.print = 15
)

# Bundle the xaringanExtra clipboard widget and the Font Awesome dependency
# into one tag list. The three texts are the icon markup for the button, the
# success state and the error state.
htmltools::tagList(
  xaringanExtra::use_clipboard(
    button_text = '<i class="fa fa-clipboard"></i>',
    success_text = '<i class="fa fa-check" style="color: #90BE6D"></i>',
    error_text = '<i class="fa fa-times-circle" style="color: #F94144"></i>'
  ),
  rmarkdown::html_dependency_font_awesome()
)
# install.packages("tidyverse")
library("tidyverse")

# alternativ:
# install.packages("stringr")
# library("stringr")
# Read the data set (use your own path when not working in the cloud project).
gbbo <- readRDS("../datasets/gbbo.rds")
gbbo

# Keep only the tweet text for the string exercises.
tweet <- gbbo %>%
  select(text)

# Running row number used to address individual tweets. It is derived from
# the data so it stays valid when the number of tweets changes.
tweet$id <- seq_len(nrow(tweet))

head(tweet, n = 100)
# Number of characters of every tweet.
str_length(tweet$text)

# ... or of one specific tweet
str_length(tweet$text[23])

# ... or with piping. pull() hands str_length() the character vector itself
# instead of a one-column tibble that would have to be coerced implicitly.
tweet %>%
  filter(id == 275) %>%
  pull(text) %>%
  str_length()

# Characters 2 to 5 of the 23rd tweet.
str_sub(tweet$text[23], start = 2, end = 5)

# Same extraction with piping. pull() returns the text as a character vector,
# so str_sub() does not receive a tibble.
tweet %>%
  filter(id == 23) %>%
  pull(text) %>%
  str_sub(start = 2, end = 5)
# Single character at position 25 of the 23rd tweet.
str_sub(tweet$text[23], start = 25, end = 25)

# Negative positions count from the end of the string: from the 23rd-last up
# to the second-last character.
str_sub(tweet$text[23], start = -23, end = -2)
# Show the 23rd tweet, overwrite its character at position 26 with "M" via the
# replacement form of str_sub(), then show it again.
tweet$text[23]

str_sub(tweet$text[23], start = 26, end = 26) <- "M"

tweet$text[23]
# Repeat the 23rd tweet four times.
str_dup(tweet$text[23], times = 4)

# With piping: take the first eight characters and repeat them four times.
# pull() supplies a character vector rather than a one-column tibble.
tweet %>%
  filter(id == 23) %>%
  pull(text) %>%
  str_sub(start = 1, end = 8) %>%
  str_dup(times = 4)
# Pick tweets 23 and 731 as a small example vector.
partOfTweet <- tweet$text[c(23, 731)]

partOfTweet
# Pad both strings on both sides up to a width of 100 characters.
partOfTweet <- str_pad(partOfTweet, width = 100, side = "both")

partOfTweet
# NOTE(review): as written this call cannot work. The pipe passes the whole
# `tweet` tibble as the first argument of str_pad(), so `text` ends up in the
# width position and is not resolved as a column of `tweet`.
# Presumably kept as a deliberate counter-example for the mutate() version
# that follows -- confirm before evaluating, because a successful run would
# overwrite `tweet`.
tweet <- tweet %>%
  str_pad(text,
          100,
          side = "both"
          )
# Add a column with the tweet text padded on both sides to 100 characters.
tweet <- tweet %>%
  mutate(shortT = str_pad(text, width = 100, side = "both"))

head(tweet$shortT, n = 50)
# Truncate each tweet to 50 characters, then pad the result on both sides to
# a width of 60.
tweet <- tweet %>%
  mutate(
    truncT = str_pad(str_trunc(text, width = 50), width = 60, side = "both")
  )

head(tweet$truncT, n = 50)
# Strip the padding again: remove whitespace on both ends of truncT.
tweet <- tweet %>%
  mutate(truncT2 = str_trim(truncT, side = "both"))

head(tweet$truncT2, n = 50)
# Convert the trimmed text to lower case.
tweet <- tweet %>%
  mutate(truncT2 = str_to_lower(truncT2))

head(tweet$truncT2, n = 50)
# Smaller working copy: rows with id below 51, keeping only id and text.
tweet2 <- tweet %>%
  filter(id < 51) %>%
  select(id, text)
# Flag tweets containing the literal, case-sensitive string "thanks".
tweet2 <- tweet2 %>%
  mutate(includesThanks = str_detect(text, "thanks"))

# Frequency of matches, followed by the text of the seventh tweet.
table(tweet2$includesThanks)
tweet2$text[7]
# Same detection, but ignoring case. The result is stored in a separate
# column `includeThanks` (no "s"), next to the case-sensitive `includesThanks`.
tweet2 <- tweet2 %>%
  mutate(
    includeThanks = str_detect(text, regex("thanks", ignore_case = TRUE))
  )

table(tweet2$includeThanks)
# Three example sentences: "Thanks" as a word, "Thanks" inside "Thanksgiving",
# and a sentence without the word.
beispiel <- c(
  "Thanks! It means a lot to me",
  "I hope I see you on Thanksgiving",
  "Maybe next time!"
)

# Plain case-insensitive search for "thanks".
str_detect(beispiel, regex("thanks", ignore_case = TRUE))
# Stricter pattern: "thanks" must be followed by whitespace or punctuation.
str_detect(
  beispiel,
  regex("\\s?thanks(\\s|[:punct:])", ignore_case = TRUE)
)
# Count how often "Prue" occurs in each tweet, ignoring case.
tweet2 <- tweet2 %>%
  mutate(countPrue = str_count(text, regex("Prue", ignore_case = TRUE)))

# Show the text next to its count.
tweet2 %>%
  select(text, countPrue)

# Display only (the result is not assigned): first case-insensitive match of
# "thanks" in each tweet.
tweet2 %>%
  mutate(
    extractThanks = str_extract(text, regex("thanks", ignore_case = TRUE))
  )
# Extract the first "@" character of each tweet into `mentions`.
tweet2 <- tweet2 %>%
  mutate(mentions = str_extract(text, "@"))

tweet2 %>%
  select(id, text, mentions)
# Collect every "@" followed by word characters per tweet. With
# simplify = FALSE the matches are stored in the list-column `mention`.
tweet <- tweet %>%
  mutate(
    mention = str_extract_all(text, regex("@\\w+"), simplify = FALSE)
  )

head(tweet$mention, n = 50)
# First mention of the ninth tweet. `[[9]]` extracts the character vector of
# that tweet; with `[9]` the trailing `[1]` would only re-select the same
# one-element list.
tweet$mention[[9]][1]
# Spread the list-column `mention` into one column per mention. The elements
# are unnamed, so names_sep = "" lets tidyr build the column names as
# mention1, mention2, ... directly. This works for any number of mentions and
# needs no follow-up rename of auto-generated `...N` columns.
tweet <- tweet %>%
  unnest_wider(mention, names_sep = "")

head(tweet)

tweet
# Keep only the tweet texts that contain at least one "@" followed by word
# characters.
tweetWithMentions <- str_subset(tweet$text, regex("@\\w+"))

tweetWithMentions

# Tweet 13 before the cleanup.
tweet$text[13]

# Replace line breaks with a space, so that words on adjacent lines are not
# fused together, then collapse runs of spaces into a single space.
tweet <- tweet %>%
  mutate(
    text = str_replace_all(text, "\\n", " "),
    text = str_replace_all(text, " +", " ")
  )

# Tweet 13 after the cleanup.
tweet$text[13]

# Tweet 9 before cleaning.
tweet$text[9]

# Build `cleanText` in four steps: drop links, drop hashtags, drop mentions,
# then collapse the runs of spaces left behind.
tweet <- tweet %>%
  mutate(
    cleanText = text %>%
      str_remove_all("http\\S+\\s*") %>%
      str_remove_all("#\\w+") %>%
      str_remove_all("@\\w+") %>%
      str_replace_all(" +", " ")
  )

tweet$cleanText[9]
# Remove a single leading and a single trailing space from `cleanText`.
tweet <- tweet %>%
  mutate(
    cleanText = cleanText %>%
      str_remove("^ ") %>%
      str_remove(" $")
  )

tweet$cleanText[9]