getwd() # Shows the working directory (wd)

#setwd(choose.dir()) # Select the working directory interactively

setwd("C:/Users/nt-lab/Documents/") # Changes the wd

savehistory(file="mylog.Rhistory")

mydata <- read.csv("C:/Users/nt-lab/Documents/mydata.csv", header=TRUE)

 

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 

# Deleting rows

 

mydata <- url("http://dss.princeton.edu/training/mydata.csv")

mydata <- read.csv(mydata, header=TRUE)

 

mydata <- mydata[-4543:-nrow(mydata), ]

summary(mydata)

 

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 

# String to numeric

 

mydata <-url("http://dss.princeton.edu/training/mydata.RData")

load(mydata)

 

#mydata <- url("http://dss.princeton.edu/training/mydata.csv")

#mydata <- read.csv(mydata, header=TRUE)

 

mydata$year <- sub('[:alnum:]','[:digit:]',mydata$year)

str(mydata$year)

mydata$year <- as.numeric(mydata$year)

str(mydata$year)

 

mydata$gdppc <- sub(',','\\1',mydata$gdppc)

str(mydata$gdppc)

mydata$gdppc <- as.numeric(mydata$gdppc)

str(mydata$gdppc)

 

mydata$unemp <- sub('%','\\1',mydata$unemp)

str(mydata$unemp)

mydata$unemp <- as.numeric(mydata$unemp)

str(mydata$unemp)

 

summary(mydata)

 

# For more details type

 

?sub

 

 

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 

# Creating variables from other variables

 

mydata <- url("http://dss.princeton.edu/training/mydata.csv")

mydata <- read.csv(mydata, header=TRUE)

 

mydata$trade <- mydata$exports + mydata$imports

 

 

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 

# Replacing values within a variable

 

 

mydata$unempf <- mydata$unempf*100

mydata$unempm <- mydata$unempm*100

 

mydata$exports <- mydata$exports/1000000

mydata$imports <- mydata$imports/1000000

 

 

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 

# Counting panels

 

as.data.frame(levels(mydata$country)) #How many?

 

library(plyr)

mydata$id <- id(mydata[c("country")], drop = TRUE)

mydata[c("id","country")]

summary(mydata[c("id","country")])

 

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 

 

# Selected summary

 

mydata <- url("http://dss.princeton.edu/training/mydata.csv")

mydata <- read.csv(mydata, header=TRUE)

 

summary(mydata[c("unemp","unempf", "unempm")])

 

# All variables

 

summary(mydata)

str(mydata)

head(mydata)

tail(mydata)

head(mydata, n=25)

head(mydata)

tail(mydata)

mydata[18:30, ]

 

 

 

 

#######################################################

### GRAPHS

#######################################################

#install.packages("ggplot2")

#See http://docs.ggplot2.org/

 

mydata$country.year <- paste(mydata$country, as.character(mydata$year))

head(mydata)

row.names(mydata) <- paste(mydata$country, as.character(mydata$year))

head(mydata)

 

plot(mydata$year,mydata$unemp, main="Unemployment", xlab="Year", ylab="Unemployment rate")

 

identify(mydata$year,mydata$unemp,row.names(mydata)) # Interactive identification

 

with(subset(mydata, unemp>37), text(year, unemp, labels=country.year, pos=3))

 

######## TWO GRAPHS SEPARATED

 

par(mfrow=c(1, 2))

 

with(subset(mydata, country=="United States"), plot(year, unemp, main="Unemployment, US", xlab="Year", ylab="Unemployment rate", ylim=c(0,12)))

with(subset(mydata, country=="United States"), lines(year, unemp, lwd=2))

with(subset(mydata, country=="United States"), abline(lm(unemp~year), col="red"))

grid()

 

with(subset(mydata, country=="United Kingdom"), plot(year, unemp, main="Unemployment, UK", xlab="Year", ylab="Unemployment rate", ylim=c(0,12)))

with(subset(mydata, country=="United Kingdom"), lines(year, unemp, lwd=2))

with(subset(mydata, country=="United Kingdom"), abline(lm(unemp~year), col="red"))

grid()

 

######## TWO GRAPHS IN SAME REGION

 

with(subset(mydata, country=="United States"), plot(year, unemp, xlab="Year", ylab="Unemployment rate", ylim=c(0,12)))

with(subset(mydata, country=="United States"), lines(year, unemp, lwd=2, col="green"))

 

par(new=TRUE) #Previous plot stays

 

with(subset(mydata, country=="United Kingdom"), plot(year, unemp, main="Unemployment", xlab="Year", ylab="Unemployment rate", ylim=c(0,12), pch=8))

with(subset(mydata, country=="United Kingdom"), lines(year, unemp, lty=3, lwd=2, col="red"))

grid()

legend(1995,2,c("US","UK"),lty=c(1,3),lwd=c(2,2),pch=c(1,8), col=c("green", "red"))

 

# To missing

 

mydata$unemp[mydata$unemp==0] <- NA

mydata$unempf[mydata$unempf==0] <- NA

mydata$unempm[mydata$unempm==0] <- NA

 

summary(mydata[c("unemp","unempf", "unempm")])

 

#############################################################################

##### DESCRIPTIVE STATISTICS

#############################################################################

 

mean(mydata$gdppc, na.rm=TRUE)

with(subset(mydata, country=="United States"), mean(gdppc))

 

mean <- aggregate(mydata[c("gdppc","unemp")],by=list(country.mean=mydata$country),mean, na.rm=TRUE)

mean

 

attach(mean)

mean <- mean[order(gdppc), ]

detach(mean)

 

barplot(mean$gdppc, main="GDP pc (mean)",

horiz=TRUE,names.arg=mean$country.mean, cex.names=0.5, las=1)

grid(,y)

 

mean.30k <- subset(mean, gdppc>30000)

 

bar.30k <- barplot(mean.30k$gdppc, main="GDP pc (mean)",

horiz=TRUE,names.arg=mean.30k$country.mean,

col="lightgray",

cex.names=0.60, las=1)

grid(,y)

text(0, bar.30k, round(mean.30k$gdppc,1), pos=4) # text(x,y)

 

unemp.min <-tapply(mydata$unemp,mydata$country, min, na.rm=TRUE)

unemp.max <-tapply(mydata$unemp,mydata$country, max, na.rm=TRUE)

unemp.minmax <- round(cbind(unemp.min, unemp.max), digits=1)

unemp.minmax

 

# Export to tab-separated format, Excel can read

 

write.table(unemp.minmax, file = "unempmimax.txt", sep = "\t")

 

summary(mydata[mydata$country=="United States", ])

 

as.data.frame(sapply(mydata, mean, na.rm=TRUE))

 

with(mydata, hist(polity2, breaks="FD", col="green")) #Histogram

par(new=TRUE) #Previous plot won't be removed

density.polity2 <- density(mydata$polity2, na.rm=T)

plot(density.polity2, xlab="", ylab="", main="", axes=FALSE)

 

table(mydata$polity2)

 

mydata$ones <- 1

polity2 <- table(mydata$ones, mydata$polity2)

prop.table(polity2,1)  # 1 column pct, 2 row pct

prop.table(polity2,1)*100  # 1 column pct, 2 row pct

round(prop.table(polity2,1)*100,2)

as.data.frame(round(100*prop.table(polity2,1),2))

 

##### RECODE polity2

 

# 2 regime categories

mydata$democracy <- ifelse(mydata$polity2 > 6,c("democracy"), c("other"))

str(mydata$democracy)

 

table(mydata$country, mydata$democracy)

 

# 3 regime categories

 

library(car)

mydata$regime <- recode(mydata$polity2,

                       "-10:-6='Autocracy';

                        -5:6='Anocracy';

                        7:10='Democracy'")

str(mydata$regime)

mydata$regime <-as.factor(mydata$regime)

str(mydata$regime)

 

table(mydata$country, mydata$regime)

 

country.regime <- table(mydata$country, mydata$regime)

country.regime

 

xtabs(~mydata$country + mydata$regime)

 

#Median not affected by large values

 

mean(mydata$polity, na.rm=TRUE)

mean(mydata$polity2, na.rm=TRUE)

median(mydata$polity, na.rm=TRUE)

median(mydata$polity2, na.rm=TRUE)

 

### DECILES

#Generating deciles, useful to create portfolios

 

gdppc.quantiles <- quantile(mydata$gdppc, probs=c(seq(0,1, by=0.10)))

mydata <- within(mydata, gdppc.deciles <- cut(gdppc, breaks = gdppc.quantiles, labels = 1:10,

                                     include.lowest = TRUE))

 

mydata[c("country","year","gdppc","gdppc.deciles")]

head(mydata[c("country","year","gdppc","gdppc.deciles")], n=25)

summary(mydata[c("id","country")])

 

mydata$country.year <- paste(mydata$country, as.character(mydata$year))

row.names(mydata) <- paste(mydata$country, as.character(mydata$year))

with(mydata, plot(exports, imports))

identify(mydata$year,mydata$unemp,row.names(mydata))

with(subset(mydata, exports>1000000), text(imports, exports, labels=country.year, pos=3))

 

mydata$country.year <- paste(mydata$country, as.character(mydata$year))

row.names(mydata) <- paste(mydata$country, as.character(mydata$year))

 

scatterplot(exports~imports, data=mydata)

scatterplot(exports~imports, id.method="identify", data=mydata) #Uses identify to select cases

scatterplot(exports~imports, id.n=20, boxplots=FALSE, data=mydata) #Uses identify to select cases

 

# By group

 

scatterplot(exports~imports|democracy, data=mydata)

scatterplot(exports~imports|regime, data=mydata)

 

scatterplotMatrix(~ gdppc + unemp + trade + polity2, span=0.7, id.n=0, data=mydata)

 

### Correlation

 

cor(mydata$gdppc,mydata$unemp, use="complete.obs")  # ?cor for more Pearson default

cor.test(mydata$gdppc,mydata$unemp, use="complete.obs")  # ?cor for more Pearson default

 

with(subset(mydata, country=="United States"), cor.test(gdppc,unemp, use="complete.obs"))

with(subset(mydata, country=="Mexico"), cor.test(gdppc,unemp, use="complete.obs"))

 

### REGRESSION

# See http://dss.princeton.edu/training/RStata.pdf

 

reg1 <- lm( gdppc ~ unemp + trade + polity2, data=mydata)

reg1

summary(reg1)

gdppc.predicted <- fitted(reg1) # predicted values

mydata.residuals <- residuals(reg1) # residuals

as.data.frame(mydata.residuals)

 

### To add to data frame

reg2 <- lm( gdppc ~ unemp + trade + polity2, data=mydata, na.action="na.exclude")

summary(reg2)

mydata$gdppc.pred <- fitted(reg2)

 

residualPlots(reg1)

 

reg3 <- lm(gdppc ~ unemp + trade + factor(regime), data=mydata)

residualPlots(reg3)

 

reg4 <- lm(gdppc ~ log2(unemp) + (trade)^2 + factor(regime), data=mydata)

residualPlots(reg4)

 

#### Logit models

 

mydata$demos[mydata$polity2 > 6] <- 1

mydata$demos[mydata$polity2 <= 6] <- 0

 

table(mydata$demos)

table(mydata$democracy)

str(mydata$demos)

 

logit1 <- glm(demos ~ gdppc + unemp + trade,  data = mydata, family = "binomial")

summary(logit1)

exp(coef(logit1))

exp(cbind(OR = coef(logit1), confint(logit1)))

 

# See http://www.ats.ucla.edu/stat/r/dae/logit.htm

 

# Panel data see:

# http://dss.princeton.edu/training/Panel101R.pdf

 

#SAVING DATA

 

save(mydata,file="mydata.RData") # Saving data as *RData

 

# LOAD DATA FROM WEB

 

mydata <-url("http://dss.princeton.edu/training/mydata.RData")

load(mydata)