stringr-grammar.Rmd

title: "Textdaten bearbeiten mit **stringr**"
author: "B. Philipp Kleer"
date: "11. Oktober 2021"
output:
  html_document:
      toc: true
      toc_float:
        toc_collapsed: true
        smooth_scroll: true
      toc_depth: 3
      widescreen: true
      highlight: pygments
      theme: readable
      css: styles/style.css
      df_print: paged
      mathjax: default
      self_contained: false
      incremental: false # TRUE: dann wird jedes Bullet einzeln eingeblendet
      collapse: true # means the text output will be merged into the R source code block
      includes:
        after_body: ./styles/footer.html
        before_body: ./styles/header.html
library("knitr")
library("rmarkdown")
library("tidyverse")

# Load the data set stored in gbbo.rds; later code derives `tweet` from it.
gbbo <- readRDS("../datasets/gbbo.rds")

# Console width is a global R option, not a knitr chunk option. It is set in
# its own statement instead of being passed positionally to opts_chunk$set(),
# where its return value would be handed over as an unnamed chunk option.
options(width = 80)

# Default chunk options applied to every code chunk of this document.
opts_chunk$set(#fig.path = 'pics/s6-', # path for calculated figures
               fig.align = 'center',  # alignment of figure (also possible right, left, default)
               fig.show = 'hold', # how to show figures: hold -> direct at the end of code chunk; animate: all plots in an animation
               fig.width = 6,   # figure width
               fig.height = 6,  # figure height
               echo = TRUE,     # Code is printed
               eval = FALSE,    # Code is NOT evaluated
               warning = FALSE, # warnings are NOT displayed
               message = FALSE, # messages are NOT displayed
               size = "tiny",  # latex-size of code chunks
               background = "#E7E7E7", # background color of code chunks
               comment = "", # no hashtags before output
               results = "markdown",
               rows.print = 15
)

# Clipboard helper from xaringanExtra; its three labels are Font Awesome
# icons (default state, successful copy, failed copy).
clipboard_button <- xaringanExtra::use_clipboard(
  button_text = "<i class=\"fa fa-clipboard\"></i>",
  success_text = "<i class=\"fa fa-check\" style=\"color: #90BE6D\"></i>",
  error_text = "<i class=\"fa fa-times-circle\" style=\"color: #F94144\"></i>"
)

# Emit the helper together with the Font Awesome dependency its labels use.
htmltools::tagList(
  clipboard_button,
  rmarkdown::html_dependency_font_awesome()
)
# install.packages("tidyverse")
library("tidyverse")

# alternatively, install and load only stringr:
# install.packages("stringr")
# library("stringr")

# Read the data set (use your own path when not working in the cloud) ...
gbbo <- readRDS(file = "../datasets/gbbo.rds")

# ... and display it.
gbbo
# Keep only the tweet text for the string examples.
tweet <- gbbo %>%
  select(text)

# Running row id (1, 2, ..., number of rows). It is derived from the data
# instead of a hard-coded row count, so the assignment also works when the
# data set has a different number of rows.
tweet$id <- seq_len(nrow(tweet))

# Show the first 100 rows.
head(tweet,
     n = 100
     )
# Number of characters of every tweet
str_length(tweet$text)

# ... or of one specific tweet
str_length(tweet$text[23])

# ... or with piping. pull() extracts the column as a character vector;
# select() would hand a one-column data frame to str_length() instead.
tweet %>%
  filter(id == 275) %>%
  pull(text) %>%
  str_length()

# Characters 2 to 5 of the 23rd tweet
str_sub(tweet$text[23],
        2,
        5
        )

# The same with piping. pull() extracts the column as a character vector;
# select() would hand a one-column data frame to str_sub() instead.
tweet %>%
  filter(id == 23) %>%
  pull(text) %>%
  str_sub(2,
          5
          )
# 23rd tweet: only the character at position 25
str_sub(tweet$text[23], start = 25, end = 25)

# 23rd tweet: negative positions are counted from the end of the string
str_sub(tweet$text[23], start = -23, end = -2)
# 23rd tweet before the replacement
tweet$text[23]

# Overwrite the character at position 26 with "M"
str_sub(tweet$text[23], start = 26, end = 26) <- "M"

# 23rd tweet after the replacement
tweet$text[23]
# Repeat the 23rd tweet four times
str_dup(tweet$text[23],
        4
        )

# Repeat only its first eight characters four times. pull() extracts the
# column as a character vector; select() would hand a one-column data frame
# to str_sub() instead.
tweet %>%
  filter(id == 23) %>%
  pull(text) %>%
  str_sub(1,
          8
          ) %>%
  str_dup(4)
# Pick tweets 23 and 731 as a small example vector
partOfTweet <- tweet$text[c(23, 731)]

partOfTweet

# Pad both tweets on both sides to a width of 100
partOfTweet <- str_pad(partOfTweet, width = 100, side = "both")

partOfTweet
# NOTE(review): this call fails as written. The pipe inserts the whole `tweet`
# data frame as str_pad()'s first argument (`string`), which pushes `text`
# into the `width` position and 100 into `pad`. Presumably kept as a
# deliberate counter-example for the mutate() version -- confirm; if it is
# not intentional, wrap the str_pad() call in mutate().
tweet <- tweet %>%
  str_pad(text,
          100,
          side = "both"
          )
# New column shortT: the tweet text padded on both sides to a width of 100
tweet <- tweet %>%
  mutate(shortT = str_pad(text, width = 100, side = "both"))

# Show the first 50 padded tweets
head(tweet$shortT, n = 50)
# New column truncT: truncate the text to a width of 50 first, then pad the
# truncated string on both sides to a width of 60
tweet <- tweet %>%
  mutate(
    truncT = text %>%
      str_trunc(width = 50) %>%
      str_pad(width = 60, side = "both")
  )

# Show the first 50 truncated and padded tweets
head(tweet$truncT, n = 50)