-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_sentiment_analysis.R
115 lines (82 loc) · 3.11 KB
/
twitter_sentiment_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# TEXT MINING & SENTIMENT ANALYSIS ----
# 1.0 LIBRARIES ----
# Text
library(tidytext) # Tidy text mining
library(textdata) # Needed for AFINN
# Visualization
library(plotly)
library(ggwordcloud) # Extension for wordclouds
# Core
library(tidyverse)
library(tidyquant)
# 2.0 TIDY TEXT ----

# Load the saved tweets. The path is relative, so the script has to be run
# from the directory that contains "tweets_covid19.rds".
# NOTE(review): the code below relies on `text` and `screen_name` columns
# being present in this object - confirm against the data source.
tweets_covid19 <- read_rds("tweets_covid19.rds")

tweets_covid19 %>% glimpse()

# Tidy the data: keep only the tweet text, number each tweet (`rowid`) so that
# tokens can be traced back to their tweet, then split the text into tokens
# stored in a `word` column.
tweets_tokenized_tbl <- tweets_covid19 %>%
  select(text) %>%
  rowid_to_column() %>%
  unnest_tokens(word, text)

tweets_tokenized_tbl %>% glimpse()

# Token frequencies, most common first
tweets_tokenized_tbl %>% count(word, sort = TRUE)
# 3.0 SENTIMENT ANALYSIS ----

# 3.1 Sentiment Dictionaries
get_sentiments(lexicon = "bing")  # Categorical: positive / negative
get_sentiments(lexicon = "afinn") # Assigns polarity

# 3.2 Joining Sentiment Dictionaries with Tokenized Text
# The join key is stated explicitly so the join cannot silently pick up any
# other column the two tables happen to share, and so dplyr does not print a
# "Joining with by = ..." message. Tokens absent from the lexicon are dropped
# by the inner join.
sentiment_bing_tbl <- tweets_tokenized_tbl %>%
  inner_join(get_sentiments("bing"), by = "word")

sentiment_bing_tbl

# 3.3 Measuring Sentiment
# Overall Sentiment
sentiment_bing_tbl %>% count(sentiment)

# Sentiment by tweet: count positive and negative words per `rowid`, spread
# the two counts into columns (missing combinations filled with 0), and score
# each tweet as positive minus negative. The screen name and the original text
# are then attached by `rowid`, which is generated the same way (row order of
# `tweets_covid19`) as in the tokenized table.
sentiment_by_row_id_tbl <- sentiment_bing_tbl %>%
  select(-word) %>%
  count(rowid, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  mutate(sentiment = positive - negative) %>%
  left_join(
    tweets_covid19 %>%
      select(screen_name, text) %>%
      rowid_to_column(),
    by = "rowid"
  )

sentiment_by_row_id_tbl
# 4.0 POLARITY VISUALIZATION -----

# Wrapper that breaks long text at 60 characters so tooltips stay readable.
label_wrap <- label_wrap_gen(width = 60)

# Build the hover text shown for each scored tweet.
data_formatted <- sentiment_by_row_id_tbl %>%
  mutate(
    text_formatted = str_glue(
      "Row ID: {rowid}\nScreen Name: {screen_name}\nText:\n{label_wrap(text)}"
    )
  )

data_formatted

# Reference levels are computed once and passed as constants. Mapping them
# inside aes() would evaluate them per row and draw one overlapping line per
# tweet.
sentiment_mean   <- mean(data_formatted$sentiment)
sentiment_center <- median(data_formatted$sentiment)
sentiment_spread <- 1.96 * IQR(data_formatted$sentiment)

g <- data_formatted %>%
  ggplot(aes(rowid, sentiment)) +
  geom_line(color = "#2c3e50", alpha = 0.5) +
  # `text` is not a ggplot2 aesthetic; it is carried along for ggplotly(),
  # which uses it as the tooltip below.
  geom_point(aes(text = text_formatted), color = "#2c3e50") +
  geom_smooth(
    method = "loess",
    formula = y ~ x,
    span = 0.25,
    se = FALSE,
    color = "blue"
  ) +
  # Blue: mean sentiment. Red: median +/- 1.96 * IQR band.
  geom_hline(yintercept = sentiment_mean, color = "blue") +
  geom_hline(yintercept = sentiment_center + sentiment_spread, color = "red") +
  geom_hline(yintercept = sentiment_center - sentiment_spread, color = "red") +
  theme_tq() +
  labs(title = "Sentiment Polarity", x = "Twitter User", y = "Sentiment")

g

# Interactive version with a range slider under the x axis. The x axis holds
# numeric row ids, so the slider is simply switched on rather than given a
# date type.
ggplotly(g, tooltip = "text") %>%
  layout(
    xaxis = list(
      rangeslider = list(visible = TRUE)
    )
  )
# 5.0 WORDCLOUD -----

# Frequency of each lexicon-matched word, split by its sentiment label and
# sorted with the most frequent first.
sentiment_by_word_tbl <- sentiment_bing_tbl %>%
  count(word, sentiment, sort = TRUE)

sentiment_by_word_tbl

# Word cloud of the 100 most frequent (word, sentiment) pairs, one panel per
# sentiment. The factor levels put the "positive" panel first. Despite its
# name, `word_freq` holds the ggplot object, not a frequency table.
word_freq <- sentiment_by_word_tbl %>%
  slice(1:100) %>%
  mutate(sentiment = factor(sentiment, levels = c("positive", "negative"))) %>%
  ggplot(aes(label = word, color = sentiment, size = n)) +
  geom_text_wordcloud_area() +
  facet_wrap(~ sentiment, ncol = 2) +
  theme_tq() +
  scale_color_tq() +
  scale_size_area(max_size = 12) +
  labs(title = "Word Cloud")

word_freq