|
Example SAS code for the Democracy Data Set
by way of Rafal Raciborski
******************************************
******* LINEAR REGRESSION examples *******
******************************************;
* let's focus on just one year, let's say 1999
* the easiest way to do it is to subset the dataset
* keeping only those obs for which year is 1999;
data work.Dem1999;
set tmp1.Democracy;
if year = 1999;
run;
* from now on, we will be working with Dem1999 dataset....;
* proc means summarizes the data;
proc means data = work.Dem1999 maxdec=2;
run;
* another important statement is proc corr which returns correlations
* between all the numerical vars in the dataset;
proc corr;
var DEMSCORE REGIONPOLITY RENT MUSLIM GDPPC;
run;
* you may also plot some vars together, e.g. let's see whether countries
with
* predominant Muslim population tend to be less democratic;
* (note SAS is not case-sensitive);
proc gplot;
plot demscore * muslim;
symbol value = square;
run;
* example of a nonlinear association;
* (note the correlation between these two vars!);
proc gplot;
plot demscore * gdppc;
run;
* finally, let's regress demscore on muslim;
* (the p option fetches predicted values);
proc reg;
model demscore = muslim / p;
plot demscore*muslim p.*muslim;
run;
* how do you interpret the coefficient on muslim?
* (remember that the variable muslim varies from 0 to .99);
*effect of outliers;
*let's change America's muslim score to 1.00;
data work.Dem1999c;
set work.Dem1999;
if country = 'United States' then muslim = 1;
run;
proc reg;
model demscore = muslim / p;
plot demscore*muslim p.*muslim;
run;
quit;
********************************
********* LECTURE 2 ************
*** LINEAR REGRESSION cont'd ***
********************************;
data work.Dem1999m;
set tmp1.Democracy;
mTwo = muslim*2;
mHalf = muslim*.5;
dsTwo = demscore*2;
dsHalf = demscore*.5;
if year = 1999;
run;
* correlation does NOT depend on units of measurement;
proc corr data=work.Dem1999m;
var demscore dsTwo dsHalf MUSLIM mTwo mHalf;
run;
proc reg;
model demscore = muslim;
run;
proc reg;
model demscore = mTwo;
run;
proc reg;
model demscore = mHalf;
run;
proc reg;
model dsTwo = muslim;
run;
proc reg;
model dsTwo = mTwo;
run;
* multivariate model, just to interpret t-values;
proc reg;
model demscore = muslim netoil income regionpolity elf85d econfree trade;
* examples of the TEST statement (with labels);
* netoil: test netoil = 0;
* netoil_trade: test netoil = 0, trade = 0; * (equivalently, test netoil,
trade);
* econ_trade: test econfree + trade = 1;
run;
quit;
****************************************
********* MULTICOLLINEARITY ************
****************************************;
* note: i am adding a tradeTwice var and a non-sensical var BUMP;
data work.Dem1999;
set tmp1.Democracy;
if year = 1999;
tradeTwice = 2*trade;
bump = ( 1.5*rent + .85*netoil ) / trade;
run;
proc corr;
var _NUMERIC_;
run;
proc corr best = 4;
var _NUMERIC_;
run;
proc corr;
var bump rent trade econfree;
run;
proc reg;
no_collin_trade: model demscore = econfree trade;
bump_not_correlated_with_trade: model demscore = econfree trade bump;
* example of perfect collinearity;
tradeTwice: model demscore = econfree trade tradeTwice;
no_collin_rent: model demscore = econfree trade rent;
* warning sign: you add a var to the model, and some other var drops in
significance;
bump_correlated_with_rent: model demscore = econfree trade rent bump;
run;
* similarly;
proc reg;
model demscore = econfree rent income gdppp; *then add GDPPP;
run;
* detecting multicollinearity: look for strong correlations between the
explanatory vars;
proc corr best=4; *BEST = n prints n highest correlation coefficients;
var _NUMERIC_;
run;
* detection cont'd: use the VIF and TOL options;
proc reg;
model demscore = econfree trade rent / vif tol;
run;
proc reg;
model demscore = econfree trade rent bump / vif tol;
run;
proc reg;
model demscore = econfree rent income gdppp / vif tol;
run;
proc corr;
var income gdppp;
run;
quit;
*******************************************
************ INTERACTION TERMS ************
************* AND LOOSE ENDS **************
*******************************************;
* substantive effects first (and logarithms)....;
data work.Dem1999;
set tmp1.Democracy;
if year = 1999;
run;
proc reg;
model demscore = econfree regionfh income; * interpret income! ;
run;
proc means data = work.Dem1999 maxdec=2;
var econfree regionfh income incomenolog;
run;
proc gplot;
plot demscore * incomenolog;
plot demscore * income;
run;
* do not run unreal counterfactuals, here we look at adjusting income
by region;
proc means data = work.Dem1999 maxdec=2;
var income incomenolog;
class geo;
run;
*******************;
*continue from here;
*******************;
* create some interaction terms;
data work.Dem1999;
set tmp1.Democracy;
if year = 1999;
inc2 = incomenolog**2;
run;
proc reg;
income: model demscore = econfree regionfh income / vif tol; * interpret
income! ;
inc2: model demscore = econfree regionfh incomenolog inc2 / vif tol;
run;
* more interaction terms....;
data work.Dem1999;
set tmp1.Democracy;
if year = 1999;
if demscore GE 8 then dem01 = 1;
if demscore LT 8 then dem01 = 0;
* robustness check: recode demscore in a different manner;
dem01elf = dem01*elf85d;
demElf = demscore*elf85d;
run;
proc corr best = 5;
var dem01 elf85d dem01elf;
var demscore elf85d demElf;
run;
*remember econfree is on an inverted scale!!!;
proc reg;
demElf: model econfree = demscore elf85d demElf income / vif tol;
dem01elf: model econfree = dem01 elf85d dem01elf income / vif tol;
noInterTerm: model econfree = dem01 elf85d income / vif tol;
run;
quit;
|