getwd()
# Shows the working directory (wd)
#setwd(choose.dir()) # Select the working directory interactively
setwd("C:/Users/nt-lab/Documents/") # Changes the wd
savehistory(file="mylog.Rhistory")
mydata <- read.csv("C:/Users/nt-lab/Documents/mydata.csv", header=TRUE)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
mydata <- url("http://dss.princeton.edu/training/mydata.csv")
mydata <- read.csv(mydata,
header=TRUE)
mydata <- mydata[-4543:-nrow(mydata), ]
summary(mydata)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# String to numeric
mydata <-url("http://dss.princeton.edu/training/mydata.RData")
load(mydata)
#mydata <- url("http://dss.princeton.edu/training/mydata.csv")
#mydata <- read.csv(mydata,
header=TRUE)
mydata$year <-
sub('[:alnum:]','[:digit:]',mydata$year)
str(mydata$year)
mydata$year <-
as.numeric(mydata$year)
str(mydata$year)
mydata$gdppc
<- sub(',','\\1',mydata$gdppc)
str(mydata$gdppc)
mydata$gdppc
<- as.numeric(mydata$gdppc)
str(mydata$gdppc)
mydata$unemp
<- sub('%','\\1',mydata$unemp)
str(mydata$unemp)
mydata$unemp
<- as.numeric(mydata$unemp)
str(mydata$unemp)
summary(mydata)
# For more details type
?sub
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Creating variables from other variables
mydata <- url("http://dss.princeton.edu/training/mydata.csv")
mydata <- read.csv(mydata,
header=TRUE)
mydata$trade
<- mydata$exports + mydata$imports
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Replacing values within a variable
mydata$unempf
<- mydata$unempf*100
mydata$unempm
<- mydata$unempm*100
mydata$exports
<- mydata$exports/1000000
mydata$imports
<- mydata$imports/1000000
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
as.data.frame(levels(mydata$country))
#How many?
library(plyr)
mydata$id <-
id(mydata[c("country")], drop = TRUE)
mydata[c("id","country")]
summary(mydata[c("id","country")])
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
mydata <- url("http://dss.princeton.edu/training/mydata.csv")
mydata <- read.csv(mydata,
header=TRUE)
summary(mydata[c("unemp","unempf", "unempm")])
# All variables
summary(mydata)
str(mydata)
head(mydata)
tail(mydata)
head(mydata, n=25)
head(mydata)
tail(mydata)
mydata[18:30,
]
#######################################################
### GRAPHS
#######################################################
#install.packages("ggplot2")
#See
http://docs.ggplot2.org/
mydata$country.year
<- paste(mydata$country, as.character(mydata$year))
head(mydata)
row.names(mydata) <- paste(mydata$country,
as.character(mydata$year))
head(mydata)
plot(mydata$year,mydata$unemp, main="Unemployment", xlab="Year", ylab="Unemployment
rate")
identify(mydata$year,mydata$unemp,row.names(mydata)) #
Interactive identification
with(subset(mydata, unemp>37), text(year, unemp,
labels=country.year, pos=3))
######## TWO
GRAPHS SEPARATED
par(mfrow=c(1, 2))
with(subset(mydata,
country=="United States"), plot(year, unemp,
main="Unemployment, US", xlab="Year",
ylab="Unemployment rate", ylim=c(0,12)))
with(subset(mydata,
country=="United States"), lines(year, unemp,
lwd=2))
with(subset(mydata,
country=="United States"), abline(lm(unemp~year), col="red"))
grid()
with(subset(mydata,
country=="United Kingdom"), plot(year, unemp,
main="Unemployment, UK", xlab="Year",
ylab="Unemployment rate", ylim=c(0,12)))
with(subset(mydata,
country=="United Kingdom"), lines(year, unemp,
lwd=2))
with(subset(mydata,
country=="United Kingdom"), abline(lm(unemp~year), col="red"))
grid()
######## TWO
GRAPHS IN SAME REGION
with(subset(mydata,
country=="United States"), plot(year, unemp,
xlab="Year", ylab="Unemployment
rate", ylim=c(0,12)))
with(subset(mydata,
country=="United States"), lines(year, unemp,
lwd=2, col="green"))
par(new=TRUE) #Previous plot stays
with(subset(mydata,
country=="United Kingdom"), plot(year, unemp,
main="Unemployment", xlab="Year",
ylab="Unemployment rate", ylim=c(0,12), pch=8))
with(subset(mydata,
country=="United Kingdom"), lines(year, unemp,
lty=3, lwd=2,
col="red"))
grid()
legend(1995,2,c("US","UK"),lty=c(1,3),lwd=c(2,2),pch=c(1,8), col=c("green", "red"))
# To missing
mydata$unemp[mydata$unemp==0] <- NA
mydata$unempf[mydata$unempf==0] <- NA
mydata$unempm[mydata$unempm==0] <- NA
summary(mydata[c("unemp","unempf", "unempm")])
#############################################################################
##### DESCRIPTIVE STATISTICS
#############################################################################
mean(mydata$gdppc, na.rm=TRUE)
with(subset(mydata,
country=="United States"), mean(gdppc))
mean <-
aggregate(mydata[c("gdppc","unemp")],by=list(country.mean=mydata$country),mean,
na.rm=TRUE)
mean
attach(mean)
mean <- mean[order(gdppc),
]
detach(mean)
barplot(mean$gdppc, main="GDP pc (mean)",
horiz=TRUE,names.arg=mean$country.mean,
cex.names=0.5, las=1)
grid(,y)
mean.30k <- subset(mean, gdppc>30000)
bar.30k <- barplot(mean.30k$gdppc,
main="GDP pc (mean)",
horiz=TRUE,names.arg=mean.30k$country.mean,
col="lightgray",
cex.names=0.60, las=1)
grid(,y)
text(0, bar.30k, round(mean.30k$gdppc,1), pos=4) # text(x,y)
unemp.min <-tapply(mydata$unemp,mydata$country, min,
na.rm=TRUE)
unemp.max <-tapply(mydata$unemp,mydata$country, max,
na.rm=TRUE)
unemp.minmax <- round(cbind(unemp.min, unemp.max), digits=1)
unemp.minmax
# Export to
tab-separated format, Excel can read
write.table(unemp.minmax, file = "unempmimax.txt", sep = "\t")
summary(mydata[mydata$country=="United
States", ])
as.data.frame(sapply(mydata, mean,
na.rm=TRUE))
with(mydata, hist(polity2,
breaks="FD", col="green")) #Histogram
par(new=TRUE) #Previous plot won't be removed
density.polity2
<- density(mydata$polity2, na.rm=T)
plot(density.polity2, xlab="",
ylab="", main="", axes=FALSE)
table(mydata$polity2)
mydata$ones <-
1
polity2 <- table(mydata$ones, mydata$polity2)
prop.table(polity2,1)
# 1 column pct, 2 row pct
prop.table(polity2,1)*100 # 1 column pct, 2
row pct
round(prop.table(polity2,1)*100,2)
as.data.frame(round(100*prop.table(polity2,1),2))
##### RECODE
polity2
# 2 regime
categories
mydata$democracy
<- ifelse(mydata$polity2 >
6,c("democracy"), c("other"))
str(mydata$democracy)
table(mydata$country, mydata$democracy)
# 3 regime
categories
library(car)
mydata$regime
<- recode(mydata$polity2,
"-10:-6='Autocracy';
-5:6='Anocracy';
7:10='Democracy'")
str(mydata$regime)
mydata$regime
<-as.factor(mydata$regime)
str(mydata$regime)
table(mydata$country, mydata$regime)
country.regime <- table(mydata$country, mydata$regime)
country.regime
xtabs(~mydata$country + mydata$regime)
#Median not
affected by large values
mean(mydata$polity, na.rm=TRUE)
mean(mydata$polity2, na.rm=TRUE)
median(mydata$polity, na.rm=TRUE)
median(mydata$polity2, na.rm=TRUE)
### DECILES
#Generating deciles, useful to
create portfolios
gdppc.quantiles <- quantile(mydata$gdppc, probs=c(seq(0,1, by=0.10)))
mydata <- within(mydata,
gdppc.deciles <- cut(gdppc,
breaks = gdppc.quantiles, labels = 1:10,
include.lowest = TRUE))
mydata[c("country","year","gdppc","gdppc.deciles")]
head(mydata[c("country","year","gdppc","gdppc.deciles")],
n=25)
summary(mydata[c("id","country")])
mydata$country.year
<- paste(mydata$country, as.character(mydata$year))
row.names(mydata) <- paste(mydata$country,
as.character(mydata$year))
with(mydata, plot(exports, imports))
identify(mydata$year,mydata$unemp,row.names(mydata))
with(subset(mydata,
exports>1000000), text(imports, exports, labels=country.year,
pos=3))
mydata$country.year
<- paste(mydata$country, as.character(mydata$year))
row.names(mydata) <- paste(mydata$country,
as.character(mydata$year))
scatterplot(exports~imports, data=mydata)
scatterplot(exports~imports, id.method="identify",
data=mydata) #Uses identify to select cases
scatterplot(exports~imports, id.n=20,
boxplots=FALSE, data=mydata) #Uses identify to select
cases
# By group
scatterplot(exports~imports|democracy, data=mydata)
scatterplot(exports~imports|regime, data=mydata)
scatterplotMatrix(~ gdppc + unemp + trade + polity2, span=0.7, id.n=0,
data=mydata)
### Correlation
cor(mydata$gdppc,mydata$unemp, use="complete.obs") # ?cor for more
Pearson default
cor.test(mydata$gdppc,mydata$unemp, use="complete.obs") # ?cor for more
Pearson default
with(subset(mydata,
country=="United States"), cor.test(gdppc,unemp, use="complete.obs"))
with(subset(mydata,
country=="Mexico"), cor.test(gdppc,unemp, use="complete.obs"))
### REGRESSION
# See
http://dss.princeton.edu/training/RStata.pdf
reg1 <- lm( gdppc ~ unemp
+ trade + polity2, data=mydata)
reg1
summary(reg1)
gdppc.predicted <- fitted(reg1)
# predicted values
mydata.residuals <- residuals(reg1)
# residuals
as.data.frame(mydata.residuals)
### To add to
data frame
reg2 <- lm( gdppc ~ unemp
+ trade + polity2, data=mydata, na.action="na.exclude")
summary(reg2)
mydata$gdppc.pred
<- fitted(reg2)
residualPlots(reg1)
reg3 <- lm(gdppc ~ unemp
+ trade + factor(regime), data=mydata)
residualPlots(reg3)
reg4 <- lm(gdppc ~ log2(unemp) + (trade)^2 + factor(regime), data=mydata)
residualPlots(reg4)
#### Logit models
mydata$demos[mydata$polity2
> 6] <- 1
mydata$demos[mydata$polity2
<= 6] <- 0
table(mydata$demos)
table(mydata$democracy)
str(mydata$demos)
logit1 <- glm(demos
~ gdppc + unemp +
trade, data = mydata,
family = "binomial")
summary(logit1)
exp(coef(logit1))
exp(cbind(OR = coef(logit1), confint(logit1)))
# See
http://www.ats.ucla.edu/stat/r/dae/logit.htm
# Panel data see:
#
http://dss.princeton.edu/training/Panel101R.pdf
#SAVING DATA
save(mydata,file="mydata.RData")
# Saving data as *RData
# LOAD DATA FROM
WEB
mydata <-url("http://dss.princeton.edu/training/mydata.RData")
load(mydata)