Tidy Text Analysis: Word frequencies & n-grams

Packages

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(wordcloud2)
library(readtext)
library(friends)
library(tm) # for the Corpus function
Loading required package: NLP

Attaching package: 'NLP'

The following object is masked from 'package:ggplot2':

    annotate
library(jiebaR)
Loading required package: jiebaRD
library(png)

With demo data

# Quick demos using wordcloud2's bundled frequency tables.

# Chinese demo frequencies: light random palette on a black background.
wordcloud2(demoFreqC, color = "random-light", backgroundColor = "black")

# Same data, half size, every word tilted at a fixed -30 degrees.
wordcloud2(demoFreqC,
           size = 0.5,
           shape = "circle",
           minRotation = -pi/6,
           maxRotation = -pi/6,
           rotateRatio = 1,
           color = "random-light",
           backgroundColor = "black")

# English demo: keep only the 50 most frequent words before plotting.
demoFreq |>
  slice_max(order_by = freq, n = 50) |>
  wordcloud2(size = 1.5, shape = "circle")

More complex setup example using Friends data

# Build a tm corpus from every line spoken by Ross Geller.
ross_lines <- friends |>
  filter(speaker == "Ross Geller") |>
  pull(text)

docs <- Corpus(VectorSource(ross_lines))

# Normalise the text step by step: lower-case, then strip punctuation,
# digits, English stop words, and redundant whitespace.
# (tm emits "transformation drops documents" warnings for SimpleCorpus
# transformations; they are expected here.)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("en"))
docs <- tm_map(docs, stripWhitespace)
Warning in tm_map.SimpleCorpus(Corpus(VectorSource(pull(select(filter(friends,
: transformation drops documents
Warning in
tm_map.SimpleCorpus(tm_map(Corpus(VectorSource(pull(select(filter(friends, :
transformation drops documents
Warning in
tm_map.SimpleCorpus(tm_map(tm_map(Corpus(VectorSource(pull(select(filter(friends,
: transformation drops documents
Warning in
tm_map.SimpleCorpus(tm_map(tm_map(tm_map(Corpus(VectorSource(pull(select(filter(friends,
: transformation drops documents
Warning in
tm_map.SimpleCorpus(tm_map(tm_map(tm_map(tm_map(Corpus(VectorSource(pull(select(filter(friends,
: transformation drops documents
# Tidytext alternative to the tm pipeline: tokenize Ross's lines
# and keep the words that occur more than 10 times.
ross_text <- friends |>
  filter(speaker == "Ross Geller") |>
  select(text)

docs1 <- ross_text |>
  unnest_tokens(output = word, input = text) |>  # one row per token
  anti_join(stop_words) |>                       # drop common stop words
  count(word, sort = TRUE) |>
  filter(n > 10)
Joining with `by = join_by(word)`
# Cloud of Ross's most frequent words (tidytext counts).
wordcloud2(docs1,
           size = 1.5,
           shape = "circle",
           color = "random-light",
           backgroundColor = "black")
# Same idea via tm: term-document matrix -> per-term totals -> data frame.
dtm <- as.matrix(TermDocumentMatrix(docs))
# Row sums give each term's total frequency across all documents.
words <- sort(rowSums(dtm), decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)

wordcloud2(df,
           size = .5,
           shape = "circle",
           color = "random-light",
           backgroundColor = "black")
# save the wordcloud

Introduction to Text Analysis in R: David Caughlin

Lexicon-based sentiment analysis

# Load the SHRM discussion-board posts (46 rows; columns: Thread, Post, Text).
db <- read_csv("SHRM Discussion Board.csv")
Rows: 46 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Thread, Text
dbl (1): Post

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Tokenize the discussion-board text into single words.
db_tokens <- db |>
  unnest_tokens(output = word, input = Text)

# Remove stop words, then keep words appearing more than 10 times.
db_new <- db_tokens |>
  anti_join(stop_words) |>
  count(word, sort = TRUE) |>
  filter(n > 10)
Joining with `by = join_by(word)`
# Word cloud of the frequent discussion-board terms.
wordcloud2(db_new,
           size = 1.5,
           shape = "circle",
           color = "random-light",
           backgroundColor = "black")
# Load the Shijing (Classic of Poetry) corpus: 305 poems, text in `zhengwen`.
shi <- read_csv("../ebooks/诗/诗经/shijing.csv")
New names:
Rows: 305 Columns: 6
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(5): h2, h3, title, hplinks, zhengwen dbl (1): ...1
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
# Segment the poem text into words with jiebaR and count word frequencies.
# Fix: name the tibble column directly instead of the fragile
# `tibble() |> rename(zhengwen = 1)` positional-rename workaround.
seg_words <- shi |>
  pull(zhengwen) |>   # character vector of poem texts
  segment(worker())   # tokenize with jiebaR's default segmenter

shi_new <- tibble(zhengwen = seg_words) |>
  count(zhengwen, sort = TRUE)
# Word cloud of the segmented Shijing vocabulary.
wordcloud2(shi_new,
           size = 1.5,
           shape = "circle",
           color = "random-light",
           backgroundColor = "black")
Back to top