R Packages
we use in this sectionlibrary(DT)
library(gapminder)
library(gghighlight)
library(ggrepel)
library(stargazer)
library(tidyverse)
A scatter plot
is a common method of visualizing the
relationship between two continuous variables (variables measured on an
interval or ratio scale).Geometric Object | Meaning |
ggplot() |
Prepare the canvas for drawing the figure |
geom_point() |
Draw the scatter plot |
exp
)voteshare
)Download hr96-21.csv
df09
.exp
).voteshare
).exp
and
voteshare
.ggtitle()
function to add a main title.shape = number
.△
).df09 %>%
ggplot() +
geom_point(aes(x = exp,
y = voteshare),
color = "royalblue",
shape = 2) +
labs(x = "Election Expenses",
y = "Vote Share") +
ggtitle("Scatter Plot of Election Expenses and Vote Share: 2009 HR Election") +
theme_bw(base_family = "HiraKakuProN-W3")
solid circle
(●).Shapes | Details |
0〜14 | These are shapes with transparent insides and only outlines. |
15〜20 | These are solid shapes without outlines. |
21〜25 | The outline is adjusted with color, and the inside fill color is adjusted with fill. |
shape = 22
to display a square (□) with
‘aquamarine’ inside and ‘violet’ outline.df09 %>%
ggplot() +
geom_point(aes(x = exp,
y = voteshare),
color = "violet", # Specify the color of the outline
fill = "aquamarine", # Specify the color for the inside fill
shape = 22) + # Specify the shape of the dot
labs(x = "Election Expenses",
y = "Vote Share") +
ggtitle("Scatter Plot of Election Expenses and Vote Share: 2009 General Election") +
theme_bw(base_family = "HiraKakuProN-W3")
dimensions
) they represent.if_else()
function to create a
Democratic Party dummy variable dpj
and add a dimension to
the scatter plot.aes()
function, specify
shape = dpj
.shape = dpj
inside aes()
.seito
contains the name of the party each candidate is affiliated
with.民主
means the Democratic Party Japan in Japanese.df09 %>%
mutate(dpj = if_else(seito == "民主", "Democratic Party", "Non-Democratic Party")) %>%
ggplot() +
geom_point(aes(x = exp,
y = voteshare,
shape = dpj)) + # Differentiate dpj by the shape of the dot
labs(x = "Election Expenses",
y = "Vote Share") +
ggtitle("Scatter Plot of Election Expenses and Vote Share: 2009 HR Election") +
theme_bw(base_family = "HiraKakuProN-W3")
shape = dpj
, R automatically assigns
shapes like ‘solid circle’ (●) and ‘triangle’ (▲).scale_shape_manual()
.df09 %>%
mutate(dpj = if_else(seito == "民主", "Democratic Party", "Non-Democratic Party")) %>%
ggplot() +
geom_point(aes(x = exp,
y = voteshare,
shape = dpj)) + # Differentiate dpj by the shape of the dot
labs(x = "Election Expenses",
y = "Vote Share") +
ggtitle("Scatter Plot of Election Expenses and Vote Share: 2009 General Election") +
theme_bw(base_family = "HiraKakuProN-W3") +
theme(legend.position = "bottom") + # Position the legend at the bottom
scale_shape_manual(values = c("Democratic Party" = 1, # 'empty circle' is 1
"Non-Democratic Party" = 4)) # 'cross' is 4
aes()
function, specify
color = dpj
.
# Scatter plot of election expenses against vote share, with the dots
# colored by the Democratic Party dummy `dpj` built from `seito`.
df09 %>%
  mutate(dpj = if_else(seito == "民主", "Democratic Party", "Non-Democratic Party")) %>%
  ggplot() +
  geom_point(aes(x = exp,
                 y = voteshare,
                 color = dpj), # Differentiate dpj by color
             # Transparency is a constant, so it is set outside aes():
             # inside aes() it would be mapped through a scale and would
             # add a meaningless "alpha" entry to the legend.
             alpha = 0.5) +
  labs(x = "Election Expenses",
       y = "Vote Share") +
  ggtitle("Scatter Plot of Election Expenses and Vote Share: 2009 General Election") +
  theme_bw(base_family = "HiraKakuProN-W3") +
  theme(legend.position = "bottom") # Position the legend at the bottom
scale_color_manual()
layer.
# Same scatter plot as before, but the two dpj colors are chosen by hand
# with scale_color_manual().
df09 %>%
  mutate(dpj = if_else(seito == "民主", "Democratic Party", "Non-Democratic Party")) %>%
  ggplot() +
  geom_point(aes(x = exp,
                 y = voteshare,
                 color = dpj), # Differentiate dpj by color
             # Constant transparency belongs outside aes(); mapping it
             # inside aes() creates a spurious "alpha" legend.
             alpha = 0.5) +
  labs(x = "Election Expenses",
       y = "Vote Share") +
  ggtitle("Scatter Plot of Election Expenses and Vote Share: 2009 General Election") +
  theme_bw(base_family = "HiraKakuProN-W3") +
  theme(legend.position = "bottom") + # Position the legend at the bottom
  scale_color_manual(values = c("Democratic Party" = "blue",
                                "Non-Democratic Party" = "gold"))
colors()
in the console.[1] "white" "aliceblue" "antiquewhite" "antiquewhite1"
[5] "antiquewhite2" "antiquewhite3"
#FF0000
, and royal
blue as #4169E1
.geom_smooth(method = lm)
.aes()
function inside
ggplot()
, and set the x axis, y axis, and color.
# Build the party-colored scatter plot with one regression line per party
# and keep it in `plot_vs_09` so later steps can add layers to it.
plot_vs_09 <- df09 %>%
  ggplot(aes(x = exp,
             y = voteshare,
             color = seito)) +
  # Transparency applies to the dots only, so it is set as a constant on
  # geom_point(). Placing `alpha = 0.5` in the global aes() would map it
  # through a scale, add an "alpha" legend and also affect geom_smooth().
  geom_point(alpha = 0.5) +
  geom_smooth(method = lm) + # Draw the regression line
  labs(x = "Election Expenses",
       y = "Vote Share") +
  ggtitle("Scatter Plot of Election Expenses and Vote Share: 2009 HR Election") +
  theme_bw(base_family = "HiraKakuProN-W3")

# Print the stored plot
plot_vs_09
facet_wrap()
function.theme()
layer and specify
axis.text.x.
# Split the stored plot into one panel per party, drop the legend and
# tilt the x-axis tick labels.
plot_vs_09 +
facet_wrap(~seito) + # Facet by political party
theme(legend.position = "none") + # Hide the legend
theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust = 1)) # Rotate the x-axis labels by 40 degrees
seito
to a factor
before passing
it to ggplot()
to change the order.
# Faceted scatter plot with the panels ordered by an explicit factor
# level order for `seito`.
df09 %>%
  mutate(seito = factor(seito,
                        levels = c("民主", "自民", "公明", "みんな",
                                   "共産", "国民新党", "幸福", "新党日本",
                                   "無所", "社民"))) %>%
  ggplot(aes(x = exp,
             y = voteshare,
             color = seito)) +
  # Constant transparency for the dots only; inside the global aes() it
  # would become a mapped aesthetic and also leak into geom_smooth().
  geom_point(alpha = 0.5) +
  geom_smooth(method = lm) + # Draw the regression line
  labs(x = "Election Expenses",
       y = "Vote Share") +
  ggtitle("Scatter Plot of Election Expenses and Vote Share: 2009 HR Election") +
  theme_bw(base_family = "HiraKakuProN-W3") +
  facet_wrap(~seito, ncol = 4) + # Display in 4 columns
  theme(legend.position = "none") + # Hide the legend
  theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust = 1)) # Rotate the x-axis labels by 40 degrees
{gapminder}
data available in R
to introduce a method for visualizing information in more than three
dimensions on a two-dimensional plane.{gapminder}
package includes the following variables:Variable Name | Details |
---|---|
country | Country Name |
continent | Continent Name |
year | Year |
lifeExp | Life Expectancy |
pop | Population |
gdpPercap | GDP per Capita, in US dollars as of 2005 |
{r, results = "asis"}
in the chunk
options.Statistic | N | Mean | St. Dev. | Min | Max |
year | 1,704 | 1,979.50 | 17.27 | 1,952 | 2,007 |
lifeExp | 1,704 | 59.47 | 12.92 | 23.60 | 82.60 |
pop | 1,704 | 29,601,212.00 | 106,157,897.00 | 60,011 | 1,318,683,096 |
gdpPercap | 1,704 | 7,215.33 | 9,857.45 | 241.17 | 113,523.10 |
gapminder
}gdpPercap
)’ for the x axis and ‘Average Life Expectancy
(lifeExp
)’ for the y axis.ggplot()
, create a variable
pop_m
that converts the population into ‘millions of
people’.gapminder %>%
mutate(pop_m = pop / 1000000) %>% # Create variable pop_m, converting population to millions
ggplot() +
geom_point(aes(x = gdpPercap,
y = lifeExp)) +
labs(x = "GDP per Capita (USD)",
y = "Life Expectancy") +
theme_bw(base_family = "HiraKakuProN-W3")
x = log(gdpPercap)
.gapminder %>%
mutate(pop_m = pop / 1000000) %>%
ggplot() +
geom_point(aes(x = log(gdpPercap), # Log transform gdpPercap
y = lifeExp)) +
labs(x = "Logarithmic Value of GDP per Capita (USD)",
y = "Life Expectancy") +
theme_bw(base_family = "HiraKakuProN-W3")
gdpPercap
, a quite clear linear
relationship became apparent.# A tibble: 5 × 3
year country gdpPercap
<int> <fct> <dbl>
1 1952 Kuwait 108382.
2 1957 Kuwait 113523.
3 1962 Kuwait 95458.
4 1967 Kuwait 80895.
5 1972 Kuwait 109348.
aes()
to color-code by
continent.size = pop_m
inside aes()
to
synchronize the size of the dots with the population. - When the dots
get larger, they overlap and become difficult to see, so set the dots to
be semi-transparent (alpha = 0.5
).coord_trans(x = "log10")
.gapminder %>%
mutate(pop_m = pop / 1000000) %>%
ggplot() +
geom_point(aes(x = log(gdpPercap),
y = lifeExp,
color = continent,
size = pop_m),
alpha = 0.5) +
labs(x = "Logarithmic Value of GDP Per Capita (USD)",
y = "Life Expectancy",
size = "Population",
color = "Continent") +
theme_bw(base_family = "HiraKakuProN-W3")
geom_smooth()
to draw a fit line.gapminder %>%
mutate(pop_m = pop / 1000000) %>%
ggplot(aes(x = log(gdpPercap),
y = lifeExp,
col = continent,
size = pop_m)) +
geom_point(alpha = 0.5) +
labs(x = "Logarithmic Value of GDP Per Capita (USD)",
y = "Life Expectancy",
size = "Population",
color = "Continent") +
theme_bw(base_family = "HiraKakuProN-W3") +
geom_smooth()
# Life expectancy against log GDP per capita: semi-transparent dots sized
# by population (in millions), with one linear fit per continent.
gapminder %>%
  mutate(pop_m = pop / 1e6) %>%
  ggplot(mapping = aes(x = log(gdpPercap),
                       y = lifeExp,
                       color = continent,
                       size = pop_m)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = lm) + # Straight-line fit instead of the default smoother
  theme_bw(base_family = "HiraKakuProN-W3") +
  labs(x = "Logarithmic Value of GDP Per Capita (USD)",
       y = "Life Expectancy",
       size = "Population",
       color = "Continent")
facet_wrap(~continent)
to draw separate scatter
plots for each continent.gapminder %>%
mutate(pop_m = pop / 1000000) %>%
ggplot(aes(x = log(gdpPercap),
y = lifeExp,
col = continent,
size = pop_m)) +
geom_point(alpha = 0.5) +
labs(x = "Logarithmic Value of GDP Per Capita (USD)",
y = "Life Expectancy",
size = "Population",
color = "Continent") +
theme_bw(base_family = "HiraKakuProN-W3") +
geom_smooth(method = lm) +
facet_wrap(~continent)
Insights from the gapminder Data from 1952
to 2007
・In the continents of Africa, Asia, and Europe, ‘GDP per capita’ has a
comparable impact on ‘Life Expectancy’.
・The continents where ‘GDP per capita’ has a significant impact on
‘Life Expectancy’ (i.e., where the slope is steeper) are the Americas
and Oceania.
{gghighlight} package
.gapminder %>%
mutate(pop_m = pop / 1000000) %>%
ggplot(aes(x = log(gdpPercap),
y = lifeExp,
col = country,
size = pop_m)) +
geom_point(alpha = 0.5) +
gghighlight(country %in% c("Japan", "China", "United States"),
label_params = list(size = 3)) +
labs(x = "Logarithmic Value of GDP Per Capita (USD)",
y = "Life Expectancy",
size = "Population",
color = "Country") +
theme_bw(base_family = "HiraKakuProN-W3")
Conclusions ・In all these
countries, life expectancy increases as per capita GDP increases.
・In China, there is a sharp increase in life expectancy when the logarithm of GDP per
capita is around 6–7, followed by a more gradual increase thereafter.
・In the United States (represented by blue dots), life expectancy also
increases as per capita GDP increases, but the life expectancy is
shorter than in Japan.
## 5.1 Data Preparation
- In this section, we consider a scatter plot using data on eligible
voters aged 18 and the voting rate.
- Download the data from the 24th (2016) House of Councillors election
vote_18.csv
and save it in the RProject folder.
- Load the data.
Variable Name | Details |
---|---|
pref | Prefecture |
age18 | Voting rate of eligible voters aged 18 |
age19 | Voting rate of eligible voters aged 19 |
age1819 | Voting rate of eligible voters aged 18 and 19 |
all | Voting rate of the prefecture |
did | Population density of the prefecture |
Statistic | N | Mean | St. Dev. | Min | Max |
serial | 47 | 24.000 | 13.711 | 1 | 47 |
age18 | 47 | 48.389 | 5.307 | 35.290 | 62.230 |
age19 | 47 | 38.419 | 6.033 | 26.580 | 53.800 |
age1819 | 47 | 43.446 | 5.558 | 30.930 | 57.840 |
all | 47 | 54.968 | 3.887 | 45.520 | 62.860 |
did | 47 | 655.374 | 1,194.258 | 68.650 | 6,168.040 |
{DT} package
to display interactive descriptive
statistics of the data. If you just want to display the entire data frame on the screen at
once, enter the command knitr::kable(hc2016)
.
Data sources: Data on the voting rate of 18 and 19-year-olds in
the 24th House of Councillors ordinary election: Ministry of Internal
Affairs and Communications, conducted on July 10, 2016 (Heisei
28).
The population density data (did
) is downloadable
here Data as of
April 1, 2016
geom_text()
function to display prefecture
names.hc2016 %>%
ggplot(aes(did, all)) +
geom_point() +
stat_smooth(method = lm) +
geom_text(aes(y = all + 0.5,
label = pref),
size = 2,
family = "HiraKakuPro-W3") +
labs(x = "Urbanization index (did)", y = "Turnout") +
ggtitle("Voting rate by prefecture in the 2016 Japanese House of Councillors election") +
theme_bw(base_family = "HiraKakuProN-W3")
dplyr::filter(did < 2000)
.hc2016 %>%
filter(did < 2000) %>% # Exclude Tokyo, Osaka, Kanagawa
ggplot(aes(did, age18)) +
geom_point() +
stat_smooth(method = lm) +
ggrepel::geom_text_repel(aes(label = pref),
size = 2,
family = "HiraKakuPro-W3") +
labs(x = "Degree of Urbanization", y = "Voting Rate of 18-year-olds") +
ggtitle("Voting Rate in the 2016 House of Councillors Election (Excluding Tokyo, Osaka, Kanagawa)") +
theme_bw(base_family = "HiraKakuProN-W3")
- This makes the graph much easier to read.
- However, there is a downside of not including the three outliers.
→ To improve readability, transform the x-axis ‘Degree of
Urbanization (did)’ to logarithmic scale.
- Specify x = log(did).
# 18-year-old voting rate against log(did) for every prefecture, with a
# linear fit and the prefecture name printed slightly above each dot.
hc2016 %>%
  ggplot(mapping = aes(x = log(did), y = age18)) +
  geom_point() +
  stat_smooth(method = lm) +
  geom_text(mapping = aes(y = age18 + 0.7, label = pref), # Lift labels off the dots
            family = "HiraKakuPro-W3",
            size = 2) +
  ggtitle("Voting Rate in the 2016 House of Councillors Election (Degree of Urbanization Log Transformed)") +
  labs(x = "Degree of Urbanization (Log Transformed)", y = "Voting Rate of 18-year-olds") +
  theme_bw(base_family = "HiraKakuProN-W3")
geom_label_repel()
function.hc2016 %>%
ggplot(aes(log(did), age18)) +
geom_point() +
stat_smooth(method = lm) +
ggrepel::geom_label_repel(aes(label = pref),
size = 2,
family = "HiraKakuPro-W3") +
labs(x = "Degree of Urbanization (Log Transformed)", y = "Voting Rate of 18-year-olds") +
ggtitle("Voting Rate in the 2016 House of Councillors Election (Degree of Urbanization Log Transformed)") +
theme_bw(base_family = "HiraKakuProN-W3")
Summary of the 24th House of Councillors
Election Data (2016) ・There is a very weak negative correlation
between ‘voting rate by prefecture’ and ‘degree of urbanization’.
→ Almost no correlation.
・However, there is a positive correlation between ‘voting rate of
18-year-olds’ and ‘degree of urbanization’.
# 18-year-old voting rate against log(did), with only the six listed
# prefectures highlighted and all other dots greyed out by gghighlight.
hc2016 %>%
  ggplot(aes(log(did), age18)) +
  geom_point() +
  geom_text(aes(y = age18 + 0.7, # Lift labels slightly above the dots
                label = pref),
            size = 2,
            family = "HiraKakuPro-W3") +
  labs(x = "Degree of Urbanization (Log Transformed)",
       y = "Voting Rate of 18-year-olds") +
  ggtitle("Voting Rate of 18-years-olds:(2016 HC Election)") +
  theme_bw(base_family = "HiraKakuProN-W3") +
  # %in% replaces the chain of `pref == ... |` tests: it selects the same
  # rows, is easier to extend, and returns FALSE rather than NA when
  # `pref` is missing.
  gghighlight::gghighlight(
    pref %in% c("青森", "秋田", "岩手", "山形", "宮城", "福島"))
stat_smooth(method = lm)
is added, it’s possible to
simultaneously display the regression line for all prefectures and a
separate regression line just for the six Tohoku prefectures.
# Same highlighted scatter plot, with stat_smooth() added so that a
# regression line is drawn along with the highlighted prefectures.
hc2016 %>%
  ggplot(aes(log(did), age18)) +
  geom_point() +
  stat_smooth(method = lm) +
  geom_text(aes(y = age18 + 0.7, # Lift labels slightly above the dots
                label = pref),
            size = 2,
            family = "HiraKakuPro-W3") +
  labs(x = "Degree of Urbanization (Log Transformed)", y = "Voting Rate of 18-year-olds") +
  ggtitle("Voting Rate in the 2016 House of Councillors Election (Degree of Urbanization Log Transformed)") +
  theme_bw(base_family = "HiraKakuProN-W3") +
  # %in% replaces the chain of `pref == ... |` tests: it selects the same
  # rows and returns FALSE rather than NA when `pref` is missing.
  gghighlight::gghighlight(
    pref %in% c("青森", "秋田", "岩手", "山形", "宮城", "福島"))
congress.csv
).congress
congress
.[1] "congress" "district" "state" "party" "name" "dwnom1" "dwnom2"
# A tibble: 6 × 7
congress district state party name dwnom1 dwnom2
<dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl>
1 80 0 USA Democrat TRUMAN -0.276 0.0160
2 80 1 ALABAMA Democrat BOYKIN F. -0.0260 0.796
3 80 2 ALABAMA Democrat GRANT G. -0.0420 0.999
4 80 3 ALABAMA Democrat ANDREWS G. -0.00800 1.00
5 80 4 ALABAMA Democrat HOBBS S. -0.0820 1.07
6 80 5 ALABAMA Democrat RAINS A. -0.170 0.870
# A tibble: 6 × 7
congress district state party name dwnom1 dwnom2
<dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl>
1 112 4 WISCONS Democrat MOORE -0.538 -0.458
2 112 5 WISCONS Republican SENSENBR 1.20 -0.438
3 112 6 WISCONS Republican PETRI 0.776 -0.00300
4 112 7 WISCONS Republican DUFFY 0.781 -0.270
5 112 8 WISCONS Republican RIBBLE 0.886 -0.193
6 112 1 WYOMING Republican LUMMIS 0.932 -0.211
filter()
function to extract data for each
Congress session.eighty <- congress %>%
filter(congress == 80) # 80th Congress
twelve <- congress %>%
filter(congress == 112) # 112th Congress
# A tibble: 6 × 7
congress district state party name dwnom1 dwnom2
<dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl>
1 80 0 USA Democrat TRUMAN -0.276 0.0160
2 80 1 ALABAMA Democrat BOYKIN F. -0.0260 0.796
3 80 2 ALABAMA Democrat GRANT G. -0.0420 0.999
4 80 3 ALABAMA Democrat ANDREWS G. -0.00800 1.00
5 80 4 ALABAMA Democrat HOBBS S. -0.0820 1.07
6 80 5 ALABAMA Democrat RAINS A. -0.170 0.870
# Members of the 80th Congress placed by their two DW-NOMINATE scores
# (dwnom1 on x, dwnom2 on y) and colored by party.
eighty %>%
  ggplot(mapping = aes(x = dwnom1, y = dwnom2, color = party)) +
  geom_point() +
  ggtitle("US 80th Congress") +
  labs(x = "Economic Issues(dwnom1)",
       y = "Racial Issues(dwnom2)") +
  theme_bw(base_family = "HiraKakuProN-W3")
# Members of the 112th Congress (stored in `twelve`) placed by their two
# DW-NOMINATE scores (dwnom1 on x, dwnom2 on y) and colored by party.
twelve %>%
  ggplot(mapping = aes(x = dwnom1, y = dwnom2, color = party)) +
  geom_point() +
  ggtitle("US 112th Congress") +
  labs(x = "Economic Issues(dwnom1)",
       y = "Racial Issues(dwnom2)") +
  theme_bw(base_family = "HiraKakuProN-W3")
Analysis Results of the 80th/112th US House of Representatives Survey Data (1947-2012)
1.3 Customizing the Shape of Dots
and draw a
scatter plot of ‘election expenses’ and ‘vote percentage’ in the 2009
House of Representatives election.shape = 23
to display a ◇
with
‘yellow’ inside and ‘magenta’ outline.2.2 Adding Dimension by Changing Dot Colors
and draw a scatter plot of ‘election expenses’ and ‘vote percentage’ in
the 2009 House of Representatives election.2.3 Specifying Dot Colors
and draw a scatter
plot of ‘election expenses’ and ‘vote percentage’ in the 2009 House of
Representatives election.3. Scatter Plot with Regression Line (1)
and draw a scatter plot of ‘election expenses’ and ‘vote percentage’ in
the 2005 House of Representatives election.facet_wrap()
function to display scatter plots
for each political party, ensuring that the Liberal Democratic Party and
the Democratic Party are displayed next to each other.4.5 Highlighting Specific Countries
using
{gapminder}
to display a scatter plot of ‘log value of GDP
per capita (USD)’ and ‘life expectancy’.5.2 Adding dimensions to a scatter plot
and draw a scatter plot for the 24th (2016) House of Councillors
election, with ‘population density of prefectures (did)’ on the x-axis
and ‘voting rate of 19-year-old voters (age19)’ on the y-axis.