Introduction to text analysis using R and quanteda

Last updated on Jan 24, 2023 19 min read

Purpose

This post introduces the capabilities of the R package quanteda for managing and analyzing textual data. quanteda has been developed by Kenneth Benoit, Kohei Watanabe, and other contributors (Benoit, Kenneth, Kohei Watanabe, Haiyan Wang, Paul Nulty, Adam Obeng, Stefan Müller, and Akitaka Matsuo. (2018) “quanteda: An R package for the quantitative analysis of textual data”. Journal of Open Source Software. 3(30), 774. https://doi.org/10.21105/joss.00774; see also: http://quanteda.io/index.html).

This post builds heavily on the quanteda tutorial: https://tutorials.quanteda.io/

Data

For demonstration, we will use the corpus of United States Presidential State of the Union Addresses available through the sotu package.

# install packages
# install.packages("quanteda")
# install.packages("quanteda.textstats")
# install.packages("quanteda.textplots")
# install.packages("rvest")
# install.packages("stringr")
# install.packages("devtools")
# devtools::install_github("quanteda/quanteda.tidy")
#United States Presidential State of the Union Addresses package sotu
#install.packages("sotu")

# load packages
library("quanteda")
library("rvest")
library("stringr")
library("quanteda.textstats")
library("quanteda.textplots")
library("quanteda.tidy")
library(sotu)
library(dplyr)

Acquiring text

# Metadata that ships with the sotu package (president, year, party, ...).
meta <- sotu_meta
dplyr::glimpse(meta)

## Rows: 240
## Columns: 6
## $ X            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ president    <chr> "George Washington", "George Washington", "George Washing…
## $ year         <int> 1790, 1790, 1791, 1792, 1793, 1794, 1795, 1796, 1797, 179…
## $ years_active <chr> "1789-1793", "1789-1793", "1789-1793", "1789-1793", "1793…
## $ party        <chr> "Nonpartisan", "Nonpartisan", "Nonpartisan", "Nonpartisan…
## $ sotu_type    <chr> "speech", "speech", "speech", "speech", "speech", "speech…

# Texts of the addresses as shipped with the sotu package.
text <- sotu_text
dplyr::glimpse(text)

##  chr [1:240] "Fellow-Citizens of the Senate and House of Representatives: \n\nI embrace with great satisfaction the opportuni"| __truncated__ ...

# Attach the texts to the metadata; the new column is named after the `text`
# object.
state_of_union <- cbind(meta, text)
dplyr::glimpse(state_of_union)

## Rows: 240
## Columns: 7
## $ X            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ president    <chr> "George Washington", "George Washington", "George Washing…
## $ year         <int> 1790, 1790, 1791, 1792, 1793, 1794, 1795, 1796, 1797, 179…
## $ years_active <chr> "1789-1793", "1789-1793", "1789-1793", "1789-1793", "1793…
## $ party        <chr> "Nonpartisan", "Nonpartisan", "Nonpartisan", "Nonpartisan…
## $ sotu_type    <chr> "speech", "speech", "speech", "speech", "speech", "speech…
## $ text         <chr> "Fellow-Citizens of the Senate and House of Representativ…

# Keep only the addresses given by Barack Obama and Donald Trump.
data_sotu <- state_of_union %>%
  filter(president %in% c("Barack Obama", "Donald Trump"))

glimpse(data_sotu)

## Rows: 12
## Columns: 7
## $ X            <int> 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240
## $ president    <chr> "Barack Obama", "Barack Obama", "Barack Obama", "Barack O…
## $ year         <int> 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 201…
## $ years_active <chr> "2009-2013", "2009-2013", "2009-2013", "2009-2013", "2013…
## $ party        <chr> "Democratic", "Democratic", "Democratic", "Democratic", "…
## $ sotu_type    <chr> "speech", "speech", "speech", "speech", "speech", "speech…
## $ text         <chr> "Madam Speaker, Mr. Vice President, Members of Congress, …

# Show which president each retained address belongs to.
data_sotu[["president"]]

##  [1] "Barack Obama" "Barack Obama" "Barack Obama" "Barack Obama" "Barack Obama"
##  [6] "Barack Obama" "Barack Obama" "Barack Obama" "Donald Trump" "Donald Trump"
## [11] "Donald Trump" "Donald Trump"

Creating a text corpus

# Build a quanteda corpus from the data frame, reading the documents from the
# `text` column (the default text field, spelled out here).
data_corpus <- corpus(data_sotu, text_field = "text")

# Number of documents in the corpus.
ndoc(data_corpus)

## [1] 12

Tokenizing a corpus

Next, we tokenize our text corpus. Typically, tokenization involves separating texts by white spaces. We tokenize the text corpus without any pre-processing using tokens().

# Tokenize the corpus with no pre-processing options set.
toks_sotu <- data_corpus %>%
  tokens()

# Show the first six tokens of the first four documents.
print(toks_sotu, max_ndoc = 4, max_ntoken = 6)

## Tokens consisting of 12 documents and 6 docvars.
## text1 :
## [1] "Madam"   "Speaker" ","       "Mr"      "."       "Vice"   
## [ ... and 6,737 more ]
## 
## text2 :
## [1] "Madam"     "Speaker"   ","         "Vice"      "President" "Biden"    
## [ ... and 8,145 more ]
## 
## text3 :
## [1] "Mr"      "."       "Speaker" ","       "Mr"      "."      
## [ ... and 7,735 more ]
## 
## text4 :
## [1] "Mr"      "."       "Speaker" ","       "Mr"      "."      
## [ ... and 7,830 more ]
## 
## [ reached max_ndoc ... 8 more documents ]

# Tokenize again and let the result auto-print with the print defaults.
data_corpus %>%
  tokens()

## Tokens consisting of 12 documents and 6 docvars.
## text1 :
##  [1] "Madam"     "Speaker"   ","         "Mr"        "."         "Vice"     
##  [7] "President" ","         "Members"   "of"        "Congress"  ","        
## [ ... and 6,731 more ]
## 
## text2 :
##  [1] "Madam"         "Speaker"       ","             "Vice"         
##  [5] "President"     "Biden"         ","             "Members"      
##  [9] "of"            "Congress"      ","             "distinguished"
## [ ... and 8,139 more ]
## 
## text3 :
##  [1] "Mr"        "."         "Speaker"   ","         "Mr"        "."        
##  [7] "Vice"      "President" ","         "Members"   "of"        "Congress" 
## [ ... and 7,729 more ]
## 
## text4 :
##  [1] "Mr"        "."         "Speaker"   ","         "Mr"        "."        
##  [7] "Vice"      "President" ","         "Members"   "of"        "Congress" 
## [ ... and 7,824 more ]
## 
## text5 :
##  [1] "Please"    ","         "everybody" ","         "have"      "a"        
##  [7] "seat"      "."         "Mr"        "."         "Speaker"   ","        
## [ ... and 7,568 more ]
## 
## text6 :
##  [1] "The"       "President" "."         "Mr"        "."         "Speaker"  
##  [7] ","         "Mr"        "."         "Vice"      "President" ","        
## [ ... and 7,896 more ]
## 
## [ reached max_ndoc ... 6 more documents ]

# Total number of tokens: ntoken() counts per document, sum() adds them up.
sum(ntoken(toks_sotu))

## [1] 86965

# Sum of the per-document type counts returned by ntype(). A type that occurs
# in several documents is counted once for each of those documents.
sum(ntype(toks_sotu))

## [1] 21696

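Without any pre-processing, the corpus consists of 86,965 tokens. Summing the number of types per document gives 21,696; note that a type occurring in several documents is counted once for each of them, so this figure is larger than the number of unique types in the corpus as a whole.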

Pre-processing

# Drop punctuation from the existing tokens, then lowercase what remains.
toks_sotu_pros <- tokens_tolower(tokens(toks_sotu, remove_punct = TRUE))

# Token total after pre-processing.
sum(ntoken(toks_sotu_pros))

## [1] 76779

# Sum of the per-document type counts after pre-processing (a type shared by
# several documents is counted once per document).
sum(ntype(toks_sotu_pros))

## [1] 20299

Keywords-in-context

We can use tokens objects to identify the occurrence of keywords and their immediate context.

# Find every match of "america" in the unprocessed tokens, keeping two tokens
# of context on either side.
kw_america <- toks_sotu %>%
  kwic(pattern = "america", window = 2)

# Number of matches found.
nrow(kw_america)

## [1] 291

# First six matches of "america", each with its context of ±2 tokens.
head(kw_america, 6)

## Keyword-in-context with 6 matches.                                                     
##   [text1, 255]     States of | America | will emerge 
##   [text1, 325]     have made | America | the greatest
##  [text1, 1067] households in | America | will receive
##  [text1, 2698]    vision for | America | , as        
##  [text1, 3164]      time for | America | to lead     
##  [text1, 3339]     energy in | America | . That's

Text processing

We remove punctuation and very frequent function words (stopwords) and transform all words to lowercase. The code below shows how to adjust the object accordingly.

# Rebuild the processed tokens from the corpus: drop punctuation, remove
# English stopwords, then lowercase the remaining tokens.
toks_sotu_pros <- tokens_tolower(
  tokens_remove(
    tokens(data_corpus, remove_punct = TRUE),
    pattern = stopwords("en")
  )
)

New token object

Let’s inspect if the changes have been implemented as we expect by calling kwic() on the new tokens object.

# Same keyword search as before, now on the processed tokens.
kw_america_pros <- toks_sotu_pros %>%
  kwic(pattern = "america", window = 2)

# First six matches of "america", each with its context of ±2 tokens.
head(kw_america_pros, 6)

## Keyword-in-context with 6 matches.                                                              
##   [text1, 109]      united states | america | emerge stronger 
##   [text1, 136]     qualities made | america | greatest force  
##   [text1, 458] working households | america | receive tax     
##  [text1, 1184]         see vision | america | blueprint future
##  [text1, 1404]        either time | america | lead thanks     
##  [text1, 1494]   renewable energy | america | need support

# test: print as table+
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Render every keyword-in-context match as a styled table with the columns
# Pre / Keyword / Post / Pattern.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned; the chain uses %>% throughout, like the rest of the document.
kw_america_pros %>%
  data.frame() %>%
  dplyr::select(Pre = pre, Keyword = keyword, Post = post, Pattern = pattern) %>%
  kbl(booktabs = TRUE) %>%
  kable_styling(
    latex_options = c("striped", "scale_down"),
    html_font = "Source Sans Pro",
    full_width = FALSE
  )

Pre	Keyword	Post	Pattern
united states	america	emerge stronger	america
qualities made	america	greatest force	america
working households	america	receive tax	america
see vision	america	blueprint future	america
either time	america	lead thanks	america
renewable energy	america	need support	america
built right	america	speaking auto	america
cost easy	america	easy necessary	america
causes bankruptcy	america	every 30	america
promise education	america	global economy	america
make children	america	already made	america
goal 2020	america	highest proportion	america
powerful example	america	ordered closing	america
united states	america	torture can	america
begun know	america	meet threats	america
meet without	america	shun negotiating	america
enduring spirit	america	quit someday	america
united states	america	thank	america
progress inevitable	america	always destined	america
hesitations fears	america	prevailed chose	america
like across	america	2 years	america
united states	america	now house	america
wait long	america	put future	america
united states	america	hard may	america
kind energy	america	now grateful	america
global economy	america	must nation	america
support right	america	tonight set	america
million jobs	america	help meet	america
just competitors	america	sits sidelines	america
united states	america	one go	america
year 2000	america	budget surplus	america
hopeful future	america	world work	america
60 years	america	takes actions	america
corruption guinea	america	must always	america
values built	america	values allowed	america
united states	america		america
leadership made	america	just place	america
predicting decline	america	still largest	america
world make	america	best place	america
revolution can	america	better anyone	america
google facebook	america	innovation just	america
produce jobs	america	overseas also	america
many nations	america	fallen ninth	america
nation builders	america	time treated	america
end decade	america	highest proportion	america
future rebuilding	america	attract new	america
d better	america	nation built	america
every part	america	digital age	america
infrastructure make	america	better place	america
goal competitive	america	submit proposal	america
united states	america	stands people	america
days founding	america	story ordinary	america
things idea	america	endures destiny	america
united states	america		america
example think	america	within reach	america
educating people	america	attracts new	america
china meanwhile	america	productive weeks	america
choose stay	america	get hit	america
stay hire	america	third american	america
jobs right	america	send tax	america
level promise	america	always win	america
every family	america	able afford	america
jobs innovation	america	always new	america
can last	america	nearly 100	america
chemicals use	america	develop resource	america
already positioned	america	world's leading	america
next decade	america	less pollution	america
infrastructure much	america	needs rebuilt	america
owner rural	america	selling products	america
great depression	america	built hoover	america
handouts copouts	america	built last	america
reduce deficit	america	built last	america
united states	america	achieve lesson	america
united states	america	position strength	america
source attacks	america	tide war	america
let doubt	america	determined prevent	america
made clear	america	pacific power	america
moral example	america	back anyone	america
anyone tells	america	decline influence	america
rio opinions	america	higher years	america
every event	america	remains one	america
cops firefighters	america	strong defend	america
watching back	america	time look	america
united states	america		america
can know	america	moves forward	america
united states	america	tax reform	america
united states	america	american people	america
priority making	america	magnet new	america
making macs	america	things can	america
made right	america	can get	america
power capacity	america	generate even	america
new goal	america	cut half	america
partnership rebuild	america	attracts private	america
united states	america	start right	america
responsible homeowner	america	chance save	america
single child	america	something able	america
make sure	america	remains place	america
right away	america	better get	america
first job	america	place chance	america
hit towns	america	get communities	america
communities stronger	america	kind prosperity-	america
say confidence	america	complete mission	america
al qaida	america	continue lead	america
meet obligations	america	must also	america
reach see	america	must remain	america
voting experience	america	definitely needs	america
united states	america		america
americans today	america	teacher spent	america
part help	america	wean foreign	america
communities across	america	fathers mothers	america
place invest	america	believe can	america
breakthrough year	america	5 years	america
get ahead	america	now face	america
eager work	america	stand still	america
believe- believe-	america	success depend	america
can help	america	lead world	america
new jobs	america	past 5	america
tomorrow edge	america	surrender federally	america
working today	america	closer energy	america
selling truck	america	knew make	america
business leader	america	join us	america
us stronger	america	fields full	america
need right	america	need get	america
women succeed	america	succeeds now	america
incredible success	america	americans overwhelmingly	america
economy good	america	every mayor	america
state legislator	america	say wait	america
yes give	america	raise give	america
half parents	america	point lives	america
choice tell	america	differently see	america
foreign partners	america	must move	america
state knows	america	always side	america
strong confident	america	can negotiate	america
advantage opportunities	america	alliance europe	america
god bless	america	things help	america
loves like	america	serves sergeant	america
remind us	america	never come	america
every citizen	america	want kids-	america
kids- rising	america	honest work	america
united states	america		america
breakthrough year	america	economy growing	america
grateful service	america	endured grit	america
always propelled	america	forward 2	america
young love	america	get much	america
hard times	america	rebekah ben's	america
planet today	america	number one	america
oil gas	america	number one	america
young children	america	creating slots	america
every worker	america	opportunity earn	america
working people	america	raise now	america
upgrade skills	america	thrived 20th	america
idea across	america	2 years	america
free universal	america	high school	america
every ceo	america	let repeat	america
since 2010	america	put people	america
know want	america	know third	america
reward invest	america	use savings	america
growth competitiveness-	america	needs go	america
united states	america	question whether	america
question whether	america	leads world	america
terrorists threaten	america	iraq syria	america
well today	america	stands strong	america
economy tatters	america	leads bluster	america
iran secures	america	allies including	america
fails alienating	america	allies making	america
said liberal	america	conservative america	america
america conservative	america	black america	america
america black	america	white america	america
america white	america	united states	america
united states	america	said seen	america
office seen	america	best seen	america
mission building	america	going arguments	america
believe best	america	share broad	america
united states	america	want grow	america
work remaking	america	laid new	america
change accelerate	america	big changes	america
idea threatening	america	control time	america
third keep	america	safe lead	america
united states	america	right now	america
trends unique	america	offend uniquely	america
say people	america	going work	america
tackling poverty	america	giving everybody	america
practices across	america	part brighter	america
discovery dna	america	thomas edison	america
washington carver	america	grace hopper	america
sally ride	america	every immigrant	america
new moonshot	america	can cure	america
save make	america	country cures	america
together keep	america	safe strong	america
getting stronger	america	getting weaker	america
united states	america	powerful nation	america
united states	america	help remake	america
exactly year	america	led coalition	america
parts central	america	africa asia	america
power says	america	always act	america
products made	america	support good	america
good jobs	america	tpp china	america
back latin	america	restored diplomatic	america
trying weaken	america	democracy grinds	america
live now	america	want make	america
kindness helped	america	travel far	america
right worth	america	know country	america
united states	america	thank	america
states citizens	america	tonight mark	america
allies find	america	ready lead	america
foe find	america	strong america	america
america strong	america	proud america	america
america proud	america	free 9	america
history world	america	look like	america
crucial demand	america	must put	america
truly make	america	great dying	america
loved one	america	refused uphold	america
form inside	america	allow nation	america
ship products	america	many countries	america
ship products	america	charge nothing	america
going let	america	great companies	america
financially yet	america	enforce rule	america
national rebuilding	america	spent approximately	america
finally keep	america	safe must	america
braver fight	america	uniform blessed	america
kind friend	america	look heroes	america
expressing people	america	respects right	america
united states	america	know america	america
america know	america	better less	america
process rebuilding	america	willing find	america
can found	america	friends today	america
250th year	america	see world	america
ask made	america	greater ever	america
action now	america	empowered aspirations	america
future believe	america	thank god	america
mission make	america	great americans	america
strong proud	america	since election	america
350 billion	america	hire another	america
believe believe	america	can dream	america
american way	america	know faith	america
united states	america	want exciting	america
substantially watch	america	also finally	america
crumbling infrastructure	america	nation builders	america
family leave	america	regains strength	america
praying everyone	america	grieving please	america
security future	america	recent weeks	america
bill puts	america	first come	america
bring best	america	see vivid	america
sergeant peck	america	salutes terrorists	america
go friends	america	enemies america	america
america enemies	america	strengthen friendships	america
stay silent	america	stands people	america
threat pose	america	allies otto	america
labor returning	america	last june	america
place called	america	small cluster	america
people making	america	great long	america
god bless	america	goodnight	america
applause year	america	recognize two	america
20th century	america	saved freedom	america
can compete	america	applause now	america
earth far	america	applause america	america
america applause	america	winning every	america
following lead	america	nation believes	america
show world	america	committed ending	america
financial wellbeing	america	moral duty	america
truly make	america	safe work	america
defeat aids	america	beyond applause	america
wealthiest south	america	state abject	america
booo president	america	founded liberty	america
renew resolve	america	never socialist	america
chants death	america	threatens genocide	america
old knew	america	prevail cause	america
yet unborn	america	us everything	america
must keep	america	first hearts	america
god bless	america	thank much	america
3 years	america	now energy	america
two administrations	america	now gained	america
united states	america	indeed place	america
went serve	america	korea vietnam	america
job put	america	first next	america
high school	america	expand equal	america
aids epidemic	america	end decade	america
sick know	america	constantly achieving	america
new trees	america	around world	america
especially rural	america	better tomorrow	america
us keep	america	safe means	america
many cities	america	radical politicians	america
united states	america	sanctuary law-abiding	america
public schools	america	punish prayer	america
preachers pastors	america	celebrate faith	america
must remember	america	always frontier	america
pad ensure	america	first nation	america
witness tonight	america	land heroes	america
almighty god	america	place anything	america
can happen	america	place anyone	america
god bless	america	thank much	america

Identifying multiword expressions

The package quanteda.textstats includes the function textstat_collocations() that automatically retrieves common multiword expressions.

# Score two- and three-word collocations that occur at least five times.
# Stopwords are removed with padding = TRUE, which leaves an empty placeholder
# where each removed token stood, so words that were separated by a stopword
# are not treated as adjacent.
tstat_coll <- textstat_collocations(
  tokens_remove(
    tokens(data_corpus, remove_punct = TRUE),
    pattern = stopwords("en"),
    padding = TRUE
  ),
  size = 2:3,
  min_count = 5
)

# First 20 rows of the collocation table.
head(tstat_coll, n = 20)

##           collocation count count_nested length   lambda        z
## 1         health care    55           44      2 8.096432 29.84438
## 2       united states    87           17      2 9.290649 29.57031
## 3           last year    46           36      2 5.714933 28.44516
## 4     american people    55           22      2 4.318872 26.54055
## 5           right now    41           29      2 4.946390 25.08424
## 6         high school    24           19      2 8.175795 22.86750
## 7           years ago    31           29      2 6.007038 22.75534
## 8           make sure    35           14      2 7.225183 22.35350
## 9        middle class    35           24      2 9.703782 22.22763
## 10         first time    27            6      2 5.187307 22.01583
## 11       clean energy    28           18      2 7.747080 21.62456
## 12           new jobs    36           24      2 4.073386 21.24003
## 13        took office    17            5      2 8.221849 20.39640
## 14   health insurance    19           16      2 7.193143 19.98062
## 15   small businesses    17           11      2 7.005867 19.52964
## 16           tax cuts    17           13      2 6.561422 19.20325
## 17          every day    18           16      2 5.335121 18.33243
## 18   working families    15            8      2 5.458811 17.86163
## 19     every american    27           17      2 3.837339 17.85800
## 20 immigration system    12            9      2 6.652498 17.73959

Document-feature matrix

Next, we transform our tokens object into a document-feature matrix (dfm). A dfm counts the occurrences of tokens in each document.

# Build the document-feature matrix from the processed tokens.
dfmat <- toks_sotu_pros %>%
  quanteda::dfm()

# Ten most frequent features over all documents.
topfeatures(dfmat, n = 10)

##  american       new   america       can       now    people        us     years 
##       310       301       291       280       279       275       246       225 
## americans      jobs 
##       220       214

# Ten most frequent features for each president.
topfeatures(dfmat, n = 10, groups = president)

## $`Barack Obama`
##      can      new      now  america   people       us     jobs american 
##      222      217      214      203      196      184      179      177 
##    years     work 
##      161      155 
## 
## $`Donald Trump`
##  american  applause     thank   america       new    people       one   country 
##       133       117        92        88        84        79        78        77 
## americans   tonight 
##        76        73

Keyness

Keyness analysis allows us to compare frequencies of words between target and reference documents. textstat_keyness() identifies features that occur differentially across different categories – in our case, Obama’s and Trump’s speeches. The function textplot_keyness() provides a way to visualize the results of the keyness analysis.

Keyness Figure

# Pool the documents by president, then compute keyness with Donald Trump's
# addresses as the target group.
tstat_key <- quanteda.textstats::textstat_keyness(
  quanteda::dfm_group(dfmat, groups = president),
  target = "Donald Trump"
)

# Plot the keyness results.
textplot_keyness(tstat_key)

Scaling document positions

Wordfish is an unsupervised one-dimensional text scaling method (a Poisson scaling model of one-dimensional document positions), estimating the positions of documents solely based on the observed word frequencies. Here we estimate the ideological positions of speeches from Barack Obama and Donald Trump.

library(quanteda.textmodels)
library(quanteda.textplots)

# Fit the Wordfish model. `dir = c(2, 1)` fixes the direction of the scale
# using documents 2 and 1.
# NOTE(review): documents 1 and 2 are both Obama addresses — confirm this is
# the intended pair for identifying the direction of the scale.
tmod_wf <- textmodel_wordfish(dfmat, dir = c(2, 1))

# Plot the Wordfish estimates by president.
textplot_scale1d(tmod_wf, groups = docvars(dfmat, "president"))

Topic model

We estimate a separate topic model for Barack Obama and for Donald Trump.

library(topicmodels)

# One document-feature matrix per president.
dfmat_obama <- quanteda::dfm_subset(dfmat, president == "Barack Obama")
dfmat_trump <- quanteda::dfm_subset(dfmat, president == "Donald Trump")

# Ten-topic LDA on Obama's addresses, fitted by Gibbs sampling with a fixed
# seed.
tmod_lda_obama <- LDA(
  dfmat_obama,
  k = 10,
  method = "Gibbs",
  control = list(seed = 1948)
)

# Ten top terms for each topic.
terms(tmod_lda_obama, 10)

##       Topic 1       Topic 2          Topic 3       Topic 4        Topic 5     
##  [1,] "ones"        "plan"           "pay"         "unemployment" "going"     
##  [2,] "skills"      "budget"         "control"     "important"    "lot"       
##  [3,] "students"    "health"         "built"       "cory"         "political" 
##  [4,] "development" "banks"          "regulations" "workforce"    "love"      
##  [5,] "prepare"     "cost"           "higher"      "u.s.a"        "democracy" 
##  [6,] "doubt"       "care"           "receive"     "son"          "especially"
##  [7,] "poverty"     "recovery"       "return"      "depend"       "alone"     
##  [8,] "efforts"     "responsibility" "gas"         "ask"          "isil"      
##  [9,] "others"      "crisis"         "brought"     "worker"       "gives"     
## [10,] "process"     "largest"        "dignity"     "iran's"       "planet"    
##       Topic 6         Topic 7    Topic 8    Topic 9      Topic 10   
##  [1,] "wage"          "spending" "can"      "none"       "past"     
##  [2,] "opportunities" "race"     "new"      "week"       "want"     
##  [3,] "achieve"       "goal"     "now"      "incentives" "sick"     
##  [4,] "5"             "come"     "america"  "trillion"   "rate"     
##  [5,] "need"          "level"    "people"   "exports"    "diplomacy"
##  [6,] "15"            "dream"    "us"       "human"      "issues"   
##  [7,] "lower"         "remember" "jobs"     "wall"       "networks" 
##  [8,] "rising"        "nation"   "american" "union"      "earth"    
##  [9,] "changing"      "internet" "years"    "income"     "forces"   
## [10,] "skills"        "willing"  "work"     "hundred"    "childcare"

# Ten-topic LDA on Trump's addresses, with the same settings and seed as the
# Obama model.
tmod_lda_trump <- LDA(
  dfmat_trump,
  k = 10,
  method = "Gibbs",
  control = list(seed = 1948)
)

# Ten top terms for each topic.
terms(tmod_lda_trump, 10)

##       Topic 1       Topic 2      Topic 3   Topic 4    Topic 5     Topic 6     
##  [1,] "learn"       "vice"       "space"   "applause" "days"      "national"  
##  [2,] "honor"       "soon"       "deliver" "usa"      "sanctuary" "foreign"   
##  [3,] "rebecca"     "countries"  "got"     "put"      "3"         "obamacare" 
##  [4,] "greatest"    "republican" "may"     "secure"   "reached"   "department"
##  [5,] "well"        "signed"     "lead"    "wall"     "aliens"    "insurance" 
##  [6,] "credit"      "crime"      "making"  "confront" "thanks"    "allow"     
##  [7,] "perhaps"     "continue"   "poverty" "whether"  "best"      "truly"     
##  [8,] "legislation" "outdated"   "york"    "century"  "lowest"    "community" 
##  [9,] "senator"     "steps"      "tariffs" "prison"   "u.s"       "ryan"      
## [10,] "increases"   "change"     "culture" "grace"    "take"      "us"        
##       Topic 7     Topic 8     Topic 9    Topic 10   
##  [1,] "american"  "invest"    "building" "look"     
##  [2,] "thank"     "crossings" "ryan"     "taking"   
##  [3,] "america"   "restoring" "veterans" "longer"   
##  [4,] "new"       "israel"    "job"      "walls"    
##  [5,] "country"   "heroic"    "serve"    "something"
##  [6,] "one"       "250th"     "kind"     "liberty"  
##  [7,] "tonight"   "women"     "stands"   "capitol"  
##  [8,] "people"    "energy"    "ms-13"    "happen"   
##  [9,] "americans" "helping"   "live"     "childhood"
## [10,] "now"       "thrive"    "strength" "black"

Visualize topic model

cf. https://www.tidytextmining.com/topicmodeling.html

library(tidytext)
library(ggplot2)
library(dplyr)

# Long table of the fitted model's "beta" matrix, as returned by tidy().
obama_topics <- tidy(tmod_lda_obama, matrix = "beta")

terms_per_topic <- 10

# Keep exactly `terms_per_topic` highest-beta terms in each topic.
# slice_max(with_ties = FALSE) replaces the superseded top_n(), which keeps
# tied rows and therefore needed a second manual slice() to trim each group.
# To plot only some topics, add e.g. filter(topic %in% c(6, 8)) before
# group_by().
obama_top_terms <- obama_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = terms_per_topic, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, -beta)

obama_top_terms$topic <- factor(obama_top_terms$topic)

# reorder_within() orders the terms separately inside every facet. A plain
# reorder(term, beta) builds one global ordering, so a term that appears in
# several topics ends up out of place in some panels. scale_x_reordered()
# strips the internal suffix that reorder_within() adds to the labels.
obama_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta)) +
  geom_col() +
  facet_wrap(~ topic, scales = "free") +
  scale_x_reordered() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip()