Example Programs‎ > ‎

Data Management

Below is a comparison of the commands used for common data management tasks in R, SAS, SPSS and Stata. The variables gender and workshop are categorical factors and q1 to q4, pretest and posttest are considered continuous and normally distributed. The practice data set is shown here. The programs and the data they use are also available for download here. Examples that are missing in the Stata column reflect differences between the two books. We will get around to those when we write the second edition of R for Stata Users.

 

Aggregating or Summarizing or
Collapsing Data Sets


 R SAS SPSS Stata
setwd("c:/myRfolder")
load(file = "mydata.RData")
attach(mydata)

# The aggregate Function.

# Means by gender.
myAgg1 <- aggregate(q1,
  by=data.frame(gender),
  mean, na.rm = TRUE)
myAgg1

# Now by workshop and gender.
myAgg2 <- aggregate(q1,
  by = data.frame(workshop,
  gender),
  mean, na.rm=TRUE)
myAgg2
mode(myAgg2)
class(myAgg2)

# Aggregation with tapply.

myAgg2 <- tapply(q1,
  data.frame(workshop, gender),
  mean, na.rm = TRUE)
myAgg2
class(myAgg2)
mode(myAgg2)

myAgg2 <- tapply(q1,
  data.frame(workshop, gender),
  range, na.rm = TRUE)
myAgg2
mode(myAgg1)
class(myAgg2)
myAgg2[[1]]

# Example multi-level transformation.

mydata$Zq1 <- (q1 - mean(q1) ) / sd(q1)
mydata

mySubset <- mydata[ q1 < mean(q1), ]
mySubset

# Rename x to be mean.q1.
library("plyr")
myAgg3 <- rename(myAgg2,
  c(x = "mean.q1") )
myAgg3

# Now merge means back with mydata.
mydata2 <- merge(mydata, myAgg3,
  by=c("workshop", "gender") )
mydata2

# Tables of Counts
table(workshop)
table(gender, workshop)
myCounts <- table(gender, workshop)
mode(myCounts)
class(myCounts)

# Counts in Summary or
# Aggregate style.
myCountsDF <- as.data.frame(myCounts)
myCountsDF
class(myCountsDF)
* Get means of q1 for each gender;
PROC SUMMARY DATA=myLib.mydata MEAN NWAY;
  CLASS GENDER;
  VAR q1;
  OUTPUT OUT=myLib.myAgg;
  RUN;
PROC PRINT; RUN;
DATA myLib.myAgg;
  SET myLib.myAgg;
  WHERE _STAT_='MEAN'’MEAN’;
  KEEP gender q1;
RUN;
PROC PRINT; RUN;

*Get means of q1 by workshop and gender;
PROC SUMMARY DATA=myLib.mydata MEAN NWAY;
CLASS WORKSHOP GENDER;
VAR Q1;
OUTPUT OUT=myLib.myAgg;RUN;
PROC PRINT; RUN;

*Strip out just the mean and matching variables;
DATA myLib.myAgg;
  SET myLib.myAgg;
  WHERE _STAT_='MEAN';
  KEEP workshop gender q1;
  RENAME q1=meanQ1;
RUN;
PROC PRINT; RUN;

*Now merge aggregated data back into mydata;
PROC SORT DATA=myLib.mydata;
  BY workshop gender; RUN:
PROC SORT DATA=myLib.myAgg;
  BY workshop gender; RUN:
DATA myLib.mydata2;
  MERGE myLib.mydata myLib.myAgg;
  BY workshop gender;
PROC PRINT; RUN;
GET FILE='mydata.sav'.

AGGREGATE
  /OUTFILE='myAgg.sav'
  /BREAK=gender
  /q1_mean = MEAN(q1).
GET FILE='myAgg.sav'.
LIST.

* Get mean of q1 by workshop
  and gender.
GET FILE='mydata.sav'.
AGGREGATE
  /OUTFILE='myAgg.sav'.
  /BREAK=workshop gender
  /q1_mean = MEAN(q1).
GET FILE='myAgg.sav'.
LIST.

* Merge aggregated data
  back into mydata.
* This step can be saved
* by using
* MODE=ADDVARIABLES
* in the previous step.
GET FILE='mydata.sav'.
SORT CASES BY  workshop (A) gender (A) .
MATCH FILES /FILE=*
 /TABLE='\myAgg.sav'
 /BY workshop gender.
SAVE OUTFILE='mydata.sav'.
* Filename: Collapse.do

use c:\myRfolder\mydata
* Get means of q1 for each gender
tabstat q1, by(gender)

* Get means of q1 by workshop and gender;
tabulate outwork gender, summarize(q1) means

* Strip out just the mean and matching variables;
collapse (mean) workshop gender, by(q1)

* Merge aggregated data back into mydata.
* mydata assumed to be sorted by q1
sort q1
merge q1 using mydata




Applying Functions
- The other packages have functions that go across the values of an observation, and procedures or commands that go down through observations. R has only commands, and which way they go can be changed by how they are "applied."

 R SAS SPSS Stata
load(file = "mydata.RData")
mydata
attach(mydata)

# Mean of the q variables
mean(mydata[3:6], na.rm = TRUE)

# Create mymatrix.
mymatrix <- as.matrix( mydata[ ,3:6] )
mymatrix

# Get mean of whole matrix.
mean(mymatrix, na.rm = TRUE)

# Get mean of matrix columns.
apply(mymatrix, 2, mean, na.rm = TRUE)

# Get mean of matrix rows.
apply(mymatrix, 1, mean, na.rm = TRUE)
rowMeans(mymatrix, na.rm = TRUE)

# Add row means to mydata.
mydata$meanQ <- apply(mymatrix, 1, mean, na.rm = TRUE)
mydata$meanQ <- rowMeans(mymatrix, na.rm = TRUE)
mydata <- transform(mydata,
  meanQ = rowMeans(mymatrix, na.rm = TRUE)
)
mydata

# Means of data frames & their vectors.
lapply(mydata[ ,3:6], mean, na.rm = TRUE)
sapply(mydata[ ,3:6], mean, na.rm = TRUE)

mean(
  sapply(mydata[ ,3:6], mean, na.rm = TRUE)
)

# Length of data frames & their vectors.
length(mydata[ ,"q3"] )
nrow(mydata)
is.na( mydata[ ,"q3"] )
!is.na( mydata[ ,"q3"] )
sum( !is.na( mydata[ ,"q3"] ) )

# Like the SAS/SPSS n from stat procedures.
library("prettyR")
sapply(mydata, valid.n)

apply(myMatrix, 1, valid.n)
mydata$myQn <- apply(myMatrix, 1, valid.n)
mydata
LIBNAME myLib 'C:\myRfolder';

DATA myLib.mydata;

SET  myLib.mydata;
myMean = MEAN(OF q1-q4);
myN = N(OF q1-q4);
RUN;

PROC MEANS;

  VAR q1-q4 myMean myN;
RUN;
 GET FILE='mydata.sav'.

* Functions work for each observation (row).

COMPUTE myMean = Mean(q1 TO q4).
COMPUTE mySum  = Sum(q1 TO q4).
COMPUTE myN = mySum / myMean.
* Filename: FunctionsCommands.do

use c:\myRfolder\mydata, clear
preserve

* Some statistical functions
egen mymean = rowmean(q1-q4)
egen mysum = rowtotal(q1-q4)
gen myn = mysum/mymean
restore

* A statistical command
summary




By or Split-File Processing

 R SAS SPSS Stata
setwd("c:/myRfolder")
load(file = "mydata.RData")
attach(mydata)

# Get means of q variables for all observations.
mean( mydata[ c("q1", "q2",
                "q3", "q4") ] ,
       na.rm = TRUE)

# Now get means by gender.
myBYout <- by( mydata[
  c("q1", "q2", "q3", "q4") ] ,
  mydata["gender"],
  mean,na.rm = TRUE)
myBYout
mode(myBYout)
class(myBYout)
myBYdata <- as.data.frame( (as.table(myBYout) ) )
myBYdata

# Get range by workshop and gender
myVars <- c("q1", "q2", "q3", "q4")
myBys  <- mydata[
  c("workshop", "gender") ]
myBYout <- by( mydata[myVars],
  myBys, range, na.rm = TRUE )

# Converting output to data frame.
# A data frame the long way.
myBYdata <- data.frame(
  rbind(myBYout[[1]], myBYout[[2]],
        myBYout[[3]], myBYout[[4]])
)

# A data frame using do.call.
myBYdata <- data.frame( do.call( rbind, myBYout) )
myBYdata
PROC MEANS DATA=myLib.mydata;
  RUN;

PROC SORT DATA=myLib.mydata;
  BY gender;
  RUN;
PROC MEANS DATA=myLib.mydata;
  BY gender;
  RUN;

PROC SORT DATA=myLib.mydata;
  BY workshop gender;
  RUN;
PROC MEANS DATA=myLib.mydata;
  BY workshop gender;
  RUN;
GET FILE='mydata.sav'.

DESCRIPTIVES
  VARIABLES=q1 q2 q3 q4
  /STATISTICS=MEAN STDDEV MIN MAX .

SORT CASES BY gender .
SPLIT FILE
  SEPARATE BY gender .
DESCRIPTIVES
  VARIABLES=q1 q2 q3 q4
  /STATISTICS=MEAN STDDEV MIN MAX .

SORT CASES BY workshop gender .
SPLIT FILE
  SEPARATE BY workshop gender .
DESCRIPTIVES
  VARIABLES=q1 q2 q3 q4
  /STATISTICS=MEAN STDDEV MIN MAX .
use c:\myRfolder\mydata, clear
sort gender
by gender: sum gender
tabstat q1 q2 q3 q4, stat(mean sd min max)
tabstat q1 q2 q3 q4, stat(mean sd min max) by(gender)



Character String Manipulations

 R SAS SPSS Stata
# Filename: CharacterStrings.R

gender <-c("m", "f", "m", NA, "m", "f", "m", "f")

gender

options(width = 58)

letters
LETTERS

library("stringr")

myVars <- str_c("Var", LETTERS[1:6])
myVars

setwd("c:/myRfolder")

giants <- read.fwf(
   file  = "giants.txt",
   width = c(15, 11, 11),
   col.names  = c("name",      "born",      "died"),
   colClasses = c("character", "character", "POSIXct")
)
giants

str_length( giants$name )


giants[ giants$name == "R.A. Fisher", ]

giants[ giants$name == "R.A. Fisher    ", ]

giants$name <- str_trim(giants$name)

attach(giants)
str_length(name)

toupper(name)

tolower(name)
library("ggplot2")
firstUpper( tolower(name) )

str_sub(name, 1, 5)


myNamesMatrix <- str_split_fixed(name, " ", 2)

myNamesMatrix

myFirst <- myNamesMatrix[ ,1]

myFirst
myLast  <- myNamesMatrix[ ,2]
myLast

myFirst <- str_replace_all(myFirst, "R.A.", "Ronald A.")


myLastFirst <- str_c( myLast, ", ", myFirst)

myLastFirst

myObs <- myLast == "Tukey"

myObs
myObs <- which(myLast == "Tukey")
myObs
giants[ myObs, ]

myObs <- str_detect(myLast, "key")

myObs

myTable <- c("Box", "Bayes", "Fisher", "Tukey")

myObs <- myLast %in% myTable
myObs
name[ myObs ]

myObs <- str_detect( myLast, "Box|Bayes|Fish|key" )

myObs
name[ myObs ]

myAthruM <- str_detect(myLastFirst, "^[A-M]")

myAthruM
name[ myAthruM ]
name[!myAthruM ]
* Filename: CharacterStrings.sas

LIBNAME MYlIB 'C:\myRfolder';

DATA myLib.giants;
INFILE '\myRfolder\giants.txt'
  MISSOVER DSD LRECL=32767;
INPUT name $char14. @16 born mmddyy10. @27 died yymmdd10.;
FORMAT born mmddyy10. died yymmdd10.;
myVarLength=length(name);
born=strip(born); *Not needed;

PROC PRINT;
  RUN;

DATA  myLib.giants;
  SET myLib.giants;
  myLower= lowcase(name);
  myUpper= upcase(name);
  myProper=propcase(name);
PROC PRINT;   RUN;

DATA  myLib.giants;
  SET myLib.giants;
  myFirst5=substr(name, 1, 5);
  * split names using substr;
  myBlank=find(name, " ");
  myFirst=strip( substr(name, 1, myBlank) );
  myLast =strip( substr(name, myBlank)    );
  PUT "Using substr... " myFirst= myLast=;
  * splip names using scan;
  myFirst=scan(name,1," ");
  myLast =scan(name,2," ");
  myFirst=tranwrd(myFirst,"R.A.","Ronald A.");
  LENGTH myLastFirst $ 17;
  myLastFirst= strip(myLast) || ", " || strip(myFirst);
  *or: CALL CATX(", ", myLastFirst, myLast, myFirst);
PROC PRINT; VAR name myFirst myLast myLastFirst;
  RUN;
DATA  tukey;
  SET myLib.giants;
  WHERE myLast="Tukey";
PROC PRINT;
  VAR name;
  RUN;
DATA  tukey;
  SET myLib.giants;
  WHERE FIND(myLast, "key");
PROC PRINT;
  VAR name;
  RUN;
DATA  mySubset;
  SET myLib.giants;
  WHERE myLast IN ("Box","Bayes","Fisher","Tukey");
  RUN;
PROC PRINT;
  VAR name;
  RUN;
DATA  FishOrKey;
  SET myLib.giants;
  IF FIND(myLast, "Box")   |
     FIND(myLast, "Bayes") |
     FIND(myLast, "Fish")  |
     FIND(myLast, "key") ;
  RUN;
PROC PRINT;
  VAR name;
  RUN;
DATA AthruM;
  SET myLib.giants;
  firstLetter=substr(myLast, 1, 1);
  IF "A" <= firstLetter <= "M";
  RUN;
PROC PRINT;
  VAR name;
  RUN;
* Filename: CharacterStrings.sps

CD 'C:\myRfolder'.

DATA LIST FILE='giants.txt' RECORDS=1
 /1 name 1-14 (A) born 16-26 (ADATE) died 27-37 (SDATE).

STRING myFirst5 (A5)/ myLower myUpper myLastFirst (A17)
  myFirst myLast (A9).
COMPUTE myLength1=LENGTH(name).
COMPUTE name=RTRIM(name).
COMPUTE myLength2=LENGTH(name).
COMPUTE myLower=LOWER(name).
COMPUTE myUpper=UPCASE(name).
LIST name myLower myUpper myLength1 myLength2.

COMPUTE myFirst5=SUBSTR(name, 1, 5).
COMPUTE myBlank= INDEX(name, " ").
COMPUTE myFirst=SUBSTR(name, 1, myBlank-1).
COMPUTE myFirst=REPLACE(myFirst, "R.A.", "Ronald A.").
COMPUTE myLast=SUBSTR(name, myBlank+1).
COMPUTE myLastFirst=CONCAT( RTRIM(myLast),
  ", ", RTRIM(myFirst) ).
LIST name myFirst myLast myLastFirst.

TEMPORARY.
SELECT IF (myLast EQ "Tukey").
LIST name.

TEMPORARY.
SELECT IF (CHAR.RINDEX(myLast,"Fish") GE 1
  OR CHAR.RINDEX(myLast,"key") GE 1).
LIST name.

TEMPORARY.
SELECT IF (
  myLast EQ "Box" OR
  myLast EQ "Bayes" OR
  myLast EQ "Fisher" OR
  myLast EQ "Tukey").
LIST name.

TEMPORARY.
SELECT IF (
  name EQ "Box" OR
  name EQ "Bayes" OR
  CHAR.RINDEX(myLast,"Fish") GE 1 OR
  CHAR.RINDEX(myLast,"key") GE 1).
LIST name.

TEMPORARY.
SELECT IF(SUBSTR(myLast, 1, 1) LE "M").
LIST name.





Date and Time Manipulations -
These examples uses a different practice data set named "giants.txt":

                     born       died
R.A. Fisher    1890-02-17 1962-07-29
Carl Pearson   1857-03-27 1936-04-27
Gertrude Cox   1900-01-13 1978-10-17
John Tukey     1915-06-16 2000-07-26
William Gosset 1876-06-13 1937-10-16


 R SAS SPSS Stata
setwd("c:/myRfolder")
giants <- read.fwf(
   file        = "giants.txt",
   width       = c(15,11,11),
   col.names   = c("name",      "born",      "died"),
   colClasses  = c("character", "character", "POSIXct"),
   row.names   = "name",
   strip.white = TRUE,
)
giants
class(giants$born)  # A character vector.

library("lubridate")
giants$born <- mdy(giants$born)
giants

class(giants$born)
class(giants$died)

giants  # They display in yyyy-mm-dd by default.
attach(giants)

unclass( born )
as.numeric( born )
as.POSIXct(
  c(-2520460800,-3558556800,-2207952000,
    -1721347200, -2952201600),
  origin="1960-01-01", tz="UTC" )

#---Calculating Durations---

age <- difftime(died, born, units="secs")
age
age <- difftime(died, born)
age # now we have age in days

mode( age )
class( age )  # it's a difftime object

as.period(age)

age/365.2425  # age in years
giants$age <- round(
  as.numeric( age/365.2425 ), 2 )
giants

now()  # Current date-time.
difftime( now(), died ) / 365.2425

# Again, using subtraction.
age <- died - born  # age in days
age
age / 365.2425   # Age in years, mislabeled.
mode(age)
class( age )     # it's an interval object.
names( age )
sapply( age, unclass )
# Not a helpful age to store in our data frame!


#---Adding Durations to Date-Times---

age <- as.duration(
         c(2286057600,2495664000,2485382400,
         2685916800,1935705600)
)
class(age)
born+age
died


#---Accessing Date-Time Elements---

year(born)
month(born)
month(born, label = TRUE)
day(born)   # day of month
wday(born)  # day of week
wday(born, label = TRUE, abbr = FALSE)
yday(born)

#---Creating Date-Times from Elements---

myYear  <- year(died)
myMonth <- month(died)
myDay   <- day(died)

myDateString <- paste(myYear, myMonth, myDay, sep="/")
myDateString
died2 <- ymd(myDateString)
died2

#---Logical Comparisons with Date-Times---

giants[ born > mdy("1/1/1900") , ]

#---SAS Picture Format Example---

myDateText <- format(born, "%B %d, %Y is day %j of %Y")
myDateText

myDateText <- format(born,
  "was born on the %jth day of %Y")
for (i in 1:5) cat(rownames(giants)[i],
  myDateText[i],"\n")

# Two-Digit Years

my1969 <- mdy("08/31/69")
my1969
my1968 <- mdy("08/31/68")
my1968
my1968 <- my1968 - as.duration(100 * 365.2425 * 24 * 60 * 60)
my1968

as.POSIXlt("08/31/68", format="%m/%d/%y")
as.POSIXlt("08/31/69", format="%m/%d/%y")

as.POSIXlt("08/31/69", format="%m/%d/%y")
CD 'C:\myRfolder'.

DATA LIST FILE='giants.txt' RECORDS=1
 /1 name 1-14 (A) born 16-26 (ADATE) died 27-37 (SDATE).

* Calculating Durations.
COMPUTE age=died-born.
LIST.
COMPUTE age=(died-born) / (365.2425*24*60*60).
LIST.
COMPUTE longAgo=($TIME-died) /(365.2425*24*60*60) .
LIST.

* Adding Durations to Date-Times.
COMPUTE died=born+age.
LIST.

* Accessing Date-Time Elements.
COMPUTE myYear=XDATE.YEAR(born).
COMPUTE myMonth=XDATE.MONTH(born).
COMPUTE myDay=XDATE.MDAY(born).
LIST name born myYear myMonth myDay.

* Creating Date-Time Variables from Elements.
COMPUTE born=DATE.MDY(myMonth, myDay, myYear).
LIST name born.

* Logical Comparisons with Date-Times.
TEMPORARY.
SELECT IF born GE date.mdy(1,1,1900).
LIST name born.

* Formatting Date-Time Output.
PRINT /born (adate) ' is the' myDay (F3.0)
  'th day of ' myYear (F4.0).
EXECUTE.

PRINT /name 'was born on the' myDay (F3.0)
  'th day of ' myYear (F4.0).
EXECUTE.
LIBNAME MyLib 'C:\myRfolder';

DATA myLib.giants;
INFILE '\myRfolder\giants.txt'
  MISSOVER DSD LRECL=32767;
INPUT name $char14. @16 born mmddyy10. @27 died mmddyy10.;

PROC PRINT;
  RUN;

PROC PRINT;
  FORMAT died born mmddyy10.;
  RUN;

* Caculating Durations.;
DATA myLib.giants;
SET  myLib.giants;
  age  = (died-born)/365.2425;
  longAgo = ( today()-died )/365.2425;
  RUN;

PROC PRINT;
FORMAT died born mmddyy10. age longAgo 5.2 ;
RUN;

* Adding Durations to Date-Times.
DATA myLib.giants;
SET myLib.giants;
  died=born+age;
  RUN;
PROC PRINT;
FORMAT died born mmddyy10. age 5.2;
RUN;

* Accessing Date-Time Elements;
DATA myLib.giants;
  SET myLib.giants;
  myYear=YEAR(born);
  myMonth=MONTH(born);
  myDay  =DAY(born);
PROC PRINT;
  FORMAT died born mmddyy10. age 5.2;
RUN;

* Creating Date-Time Variables from Elements;
DATA myLib.giants;
  set myLib.giants;
  born=MDY(myMonth, myDay, myYear);
  PROC PRINT;
  FORMAT died born mmddyy10. age 5.2;
RUN;

* Logical Comparisons with Date-Times;
DATA Born1900s;
  set myLib.giants;
  if born > "01jan1900"d ;
PROC PRINT;
  FORMAT died born mmddyy10. age 5.2;
  RUN;

* Formatting Date-Time Output;
PROC FORMAT;
  PICTURE myFormatI
  LOW - HIGH = '%B %d, %Y is day %j of %Y'
    (DATATYPE=DATE);
  RUN;
PROC PRINT DATA=myLib.giants;
  VAR born;
  FORMAT born myFormatI40.;
  RUN;

PROC FORMAT;
  PICTURE myFormatII
  LOW - HIGH = ' was born on the %jth day of %Y'
    (DATATYPE=DATE);
  RUN;
DATA _NULL_;
  SET myLib.giants;
  PUT name $char14. born myFormatII34.;
  run;




Duplicate Observations: Finding and Removing

 R SAS SPSS Stata
load("mydata.RData")
mydata

# Create some duplicates.
myDuplicates <- rbind(mydata, mydata[1:2, ])
myDuplicates

# Get rid of duplicates without seeing them.
myNoDuplicates <- unique(myDuplicates)
myNoDuplicates

# This checks for location of duplicates
# before getting rid of them.

myDuplicates <- rbind(mydata, mydata[1:2, ])
myDuplicates

myDuplicates$DupRecs <- duplicated(myDuplicates)
myDuplicates

# Print a report of just the duplicate records.
attach(myDuplicates)
myDuplicates[DupRecs, ]

# Remove duplicates and Duplicated variable.
myNoDuplicates <-
  myDuplicates[!DupRecs, -7 ]
myNoDuplicates

# Locate records with duplicate keys.
myKeys <- c("workshop", "gender")
mydata$DupKeys <-
  duplicated(mydata[ ,myKeys])
mydata
DATA mycopy; SET myLib.mydata;
Data lastTwo;
  SET myLib.mydata;
  IF ID GE 7;
  RUN;

DATA Duplicates;
  SET mycopy lastTwo;
  PROC PRINT; RUN;

PROC SORT NODUPREC DATA=Duplicates;
  BY id workshop gender q1-q4;
RUN;

PROC PRINT;
RUN;

PROC SORT NODUPKEY EQUALS DATA=mycopy;
  BY workshop gender;
RUN;

PROC PRINT DATA=mycopy;
RUN;
GET FILE='mydata.sav'.
* Identify Duplicate Cases.
SORT CASES BY workshop(A) gender(A)
  q2(A) q1(A) q3(A) q4(A) .
MATCH FILES /FILE = *
 /BY workshop gender q2 q1 q3 q4
 /FIRST = PrimaryFirst
 /LAST = PrimaryLast.
DO IF (PrimaryFirst).
+ COMPUTE MatchSequence = 1 - PrimaryLast.
ELSE.
+ COMPUTE MatchSequence = MatchSequence + 1.
END IF.
LEAVE MatchSequence.
FORMAT MatchSequence (f7).
COMPUTE InDupGrp = MatchSequence > 0.
SORT CASES InDupGrp(D).
MATCH FILES /FILE = *
 /DROP = PrimaryFirst InDupGrp MatchSequence.
VARIABLE LABELS
  PrimaryLast 'Indicator of each last matching case as Primary'.
VALUE LABELS PrimaryLast
  0 'Duplicate ‘Duplicate Case' Case’
  1 'Primary ‘Primary Case'Case’.
VARIABLE LEVEL PrimaryLast (ORDINAL).
FREQUENCIES VARIABLES = PrimaryLast .
use c:\myRfolder\mydata, clear
duplicates drop id workshop gender q1-q4, force
list



First / Last Observations per Group, Finding

 R SAS SPSS Stata
setwd("c:/myRfolder")
load(file = "mydata.RData")
mydata$id <- row.names(mydata)
mydata

myBys <-
  data.frame(mydata$workshop, mydata$gender)
mylastList <- by(mydata, myBys, tail, n = 1)
mylastList

#Back into a data frame:
mylastDF <- do.call(rbind, mylastList)
mylastDF

# Another way to create the data frame:
mylastDF <- rbind(mylastList[[1]],
                  mylastList[[2]],
                  mylastList[[3]],
                  mylastList[[4]])
mylastDF

# Generating just an indicator variable
mylastDF$lastGender <- rep(1, nrow(mylastDF) )
mylastDF

mylastDF2 <- mylastDF[ c("id", "lastGender") ]
mydata2 <- merge(mydata, mylastDF2, by = "id", all = TRUE )
mydata2

mydata2$lastGender[
  is.na(mydata2$lastGender) ] <- 0
mydata2
PROC SORT DATA=sasuser.mydata;
  BY workshop gender;
RUN;

DATA sasuser.mylast;
  SET  sasuser.mydata;
  BY   workshop gender;
  IF   last.gender;
RUN;

PROC PRINT; RUN;
* Match files method.
GET FILE='mydata.sav'.
SORT CASES BY workshop gender.
MATCH FILES FILE=* /By workshop gender /LAST=lastgender.
SELECT IF lastgender.
LIST.
SAVE OUTFILE='mylast.sav'.

* Aggregation method.
SORT CASES BY workshop gender.
AGGREGATE /OUTFILE='C:\mylast.sav'
/BREAK workshop gender
/q1 = LAST(q1)
/q2 = LAST(q2)
/q3 = LAST(q3)
/q4 = LAST(q4).

* Using LIST here would display original file.
GET FILE='mylast.sav'.
DATASET NAME DataSet5 WINDOW=FRONT.
LIST.
use c:\myRfolder\mydata, clear
sort workshop gender
by workshop gender: keep if _n==_N
list



Indicator or Dummy Variables -
R will create these automatically if you use a factor as a regression predictor.

 R SAS SPSS Stata
load("mydata100.RData")
attach(mydata100)

r     <- as.numeric(workshop == "R"    )
sas   <- as.numeric(workshop == "SAS"  )
spss  <- as.numeric(workshop == "SPSS" )
stata <- as.numeric(workshop == "Stata")

lm(posttest ~ pretest + sas + spss + stata)

# Same result without them:
lm(posttest ~ pretest + workshop)

DATA  temp;
SET myLib.mydata100;
  r     = workshop = 1;
  sas   = workshop = 2;
  spss  = workshop = 3;
  stata = workshop = 4;
RUN;

PROC REG;
  MODEL posttest = pretest sas spss stata;
  RUN;
GET FILE='mydata100.sav'.
DATASET NAME DataSet2 WINDOW=FRONT.

COMPUTE r     = workshop EQ 1.
COMPUTE sas   = workshop EQ 2.
COMPUTE spss  = workshop EQ 3.
COMPUTE stata = workshop EQ 4.
EXECUTE.

REGRESSION
  /DEPENDENT posttest
  /METHOD=ENTER pretest sas spss stata.
EXECUTE.



Joining or Merging Data Sets

 R SAS SPSS Stata
# Read data keeping ID as a variable.
mydata <- read.table("mydata.csv",
  header = TRUE, sep = ",",
  na.strings = " ")

# Create a data frame keeping
# the left two q variables.
myleft <- mydata[ c("id",
  "workshop", "gender", "q1",
  "q2") ]

# Create a data frame keeping the right two q variables.
myright <- mydata[ c("id",
  "workshop", "q3", "q4") ]

# Merge the two data frames
# by ID.
both <- merge(myleft, myright,
  by = "id")

# Merge the two data frames
# by ID allowing different
# ID names in each data set.
both <- merge(myleft, myright,
  by.x = "id", by.y = "id" )

# Merge data frames
# by both ID and workshop.
both <- merge(myleft, myright,
  by = c("id","workshop"))

#Merge dataframes by both ID
# and workshop, while allowing
# them to have different names.
both <- merge(myleft, myright,
  by.x=c("id", "workshop"),
  by.y=c("id", "workshop") )
DATA  myLib.myleft;
  SET mylib.mydata;
  KEEP id workshop gender
    q1 q2;
PROC SORT; BY id workshop;

DATA  myLib.myright;
  SET myLib.mydata;
  KEEP id workshop q3 q4;
PROC SORT; BY id workshop;

DATA  myLib.both;
  MERGE myLib.myleft
        myLib.myright;
  BY id workshop;
RUN;
GET FILE='mydata.sav'.
DELETE VARIABLES q3 to q4.
SAVE OUTFILE='myleft.sav'.

GET FILE='mydata.sav'.
DELETE VARIABLES gender,
  q1 to q2.
SAVE OUTFILE='myright.sav'.

GET FILE='myleft.sav'.
MATCH FILES /FILE=*
 /FILE='myright.sav'
 /BY id.
* Filename: Merge.do

use c:\myRfolder\mydata, clear
drop q3 q4
save c:\myRfolder\myleft

use c:\myRfolder\mydata, clear
drop workshop-q2
save c:\myRfolder\myright

use c:\myRfolder\myleft, clear
sort id

merge id using c:\myRfolder\myright
save c:\myRfolder\both, replace


Keeping or Dropping Variables

 R SAS SPSS Stata
# Keep using variable selection.
myleft <- mydata[ ,1:4]
myleft

# Drop using NULL.
myleft <- mydata
myleft$q3 <- myleft$q4 <- NULL
* Keeping vars;
DATA myleft;
  SET mydata;
  KEEP id workshop gender  
    q1 q2;

*Equivalently dropping;
DATA myleft; SET mydata;
  DROP q3 q4;
GET FILE='mydata.sav'.
DELETE VARIABLES q3 to q4.

use c:\myRfolder\mydata, clear
keep id workshop gender q1 q2
*or equivalently;
* drop q3 q4
save c:\myRfolder\myleft, replace



Missing Values -
These examples use a different version of mydata which includes periods, 9's and 99's as missing value codes. Note that the periods in R are not read automatically as missing values! Here is mydataNA:

  workshop gender q1 q2 q3 q4
1        1      f  1  1  5  1
2        2      f  2  1  4 99
3        .      f  9  2  4  3
4        2      .  3  9 99  3
5        1      m  4  5  2  4
6        .      m  9  9  5  5
7        1      .  5  3 99  4
8        2      m  4  5  5 99


 R SAS SPSS Stata
setwd("c:/myRfolder")

mydataNA <- read.table("mydataNA.txt")
mydataNA

# Read it so that ".", 9, 99 are missing.
mydataNA <- read.table("mydataNA.txt",
  na.strings = c(".", "9", "99") )
mydataNA

# Convert 9 and 99 manually
mydataNA <- read.table("mydataNA.txt",
  na.strings=".")
mydataNA[mydataNA == 9 | mydataNA == 99] <- NA
mydataNA
# Substitute the mean for missing values.
mydataNA$q1[is.na(mydataNA$q1)] <-
  mean(mydataNA$q1, na.rm = TRUE)
mydataNA

# Eliminate observations with any NAs.
myNoMissing <- na.omit(mydataNA)
myNoMissing

# Test to see if each case is complete.
complete.cases(mydataNA)

# Use that result to select complete cases.
myNoMissing <- mydataNA[ complete.cases(mydataNA), ]
myNoMissing

# Use that result to select incomplete cases.
myIncomplete <- mydataNA[ !complete.cases(mydataNA), ]
myIncomplete

# When "99" Has Meaning...
mydataNA <- read.table("mydataNA.txt", na.strings = ".")
mydataNA
attach(mydataNA)

# Assign missing values for q variables.
mydataNA$q1[q1 ==  9] <- NA
mydataNA$q2[q2 ==  9] <- NA
mydataNA$q3[q3 == 99] <- NA
mydataNA$q4[q4 == 99] <- NA
mydataNA
detach(mydataNA)

# Read file again, this time use functions.
mydataNA <- read.table("mydataNA.txt", na.strings = ".")
mydataNA
attach(mydataNA)

#Create a functions that replaces 9, 99 with NAs.
my9isNA   <- function(x) { x[x ==  9] <- NA; x}
my99isNA  <- function(x) { x[x == 99] <- NA; x}

# Now apply our functions to the data frame using lapply.
mydataNA[3:4] <- lapply( mydataNA[3:4], my9isNA )
mydataNA[5:6] <- lapply( mydataNA[5:6], my99isNA )
mydataNA
DATA myLib.mydata;
SET  myLib.mydata;

*Convert 9 to missing, one at a time.
IF q1=9 THEN q1=.;
IF q2=9 THEN q2=.;
IF q3=99 THEN q2=.;
IF q4=99 THEN q4=.;

* Same thing but is quicker for lots of vars;
ARRAY q9 q1-q2;
DO OVER q9;
   IF q9=9 THEN q=.;
END;

ARRAY q99 q3-q4;
DO OVER q99;
   IF q=99 THEN q99=.;
END;
CD 'C:\myRfolder'.
GET FILE=('mydata.sav').

MISSING q1 TO q2 (9) q3 TO q4 (99).

SAVE OUTFILE='mydata.sav'.



* Filename: MissingValues.do

use c:\myRfolder\mydata, clear
preserve
replace q1=. if q1==9
replace q2=. if q2==9
replace q3=. if q3==99
replace q4=. if q4==99

* Same thing but is quicker for lots of vars
restore
forvalues i = 1/2 {
replace q`i'=. if q`i'==9
}
forvalues i = 3/4 {
replace q`i'=. if q`i'==90
}
restore



Ranks, Calculating

 R SAS SPSS Stata
# Applies to vars 3 through 6,
# the vars q1 to q4.

myvars <- c("q1","q2","q3","q4")
myRanks <-
  sapply(mydata[myvars], rank)


PROC RANK DATA=mylib.mydata
  OUT=myRanks;
  VAR Q1-Q4;
PROC PRINT;
 RANK VARIABLES=q1 q2 q3 q4 (A)
  /RANK
  /PRINT=YES
  /TIES=MEAN.




Recoding Variables

 R SAS SPSS Stata
load(file = "myWorkspace.RData")
mydata
attach(mydata)

library("car")
mydata$qr1 <- recode(q1, "1=2; 5=4")
mydata$qr2 <- recode(q2, "1=2; 5=4")
mydata$qr3 <- recode(q3, "1=2; 5=4")
mydata$qr4 <- recode(q4, "1=2; 5=4")
detach(mydata)

# This approach works on thousands.

load(file = "mydata.RData")
attach(mydata)

# Generate two sets of var names
# so we can save original vars.
# QR vars will be the recoded ones.
myQnames <- 
  paste( "q",  1:4, sep = "")
myQRnames <-
  paste( "qr", 1:4, sep = "")

# Extract the q variables to a separate data frame.
myQRvars <- mydata[ ,myQnames]

# Rename all of the variables.
names(myQRvars) <- myQRnames
myQRvars

# Create a function to apply the labels to lots of variables.
myRecoder <- function(x) { recode(x,"1=2; 5=4") }

# Here's how to use the function
# on one variable.
myRecoder(myQRvars$qr1)

#Apply it to all of the variables.
myQRvars <- sapply( myQRvars, myRecoder)
myQRvars

# Save it back to mydata
# if you want.
mydata <- cbind(mydata,myQRvars)

DATA myLib.mydata;
INFILE "mydata.csv"  
  delimiter = ","
  MISSOVER DSD  
  LRECL=32767  
  firstobs=2;

INPUT id workshop gender $ q1 q2 q3 q4;

PROC PRINT; RUN;

PROC FORMAT;
  VALUE Agreement
  1="Disagree"
  2="Disagree"
  3="Neutral"
  4="Agree"
  5="Agree"; run;

DATA myLib.mydata;
  SET myLib.mydata;
  ARRAY q q1-q4;
  ARRAY qr qr1-qr4;
  *r for recoded;
  DO i=1 to 4;
     qr{i}=q{i};
     if q{i}=1 then
       qr{i}=2;
     else if
       q{i}=5 then qr{i}=4;
  END;
  FORMAT q1-q4 qr1-qr4
    Agreement.;
RUN;

* This will use the recoded formats automatically;

PROC FREQ; TABLES q1-q4; RUN;

* This will ignore the formats;
* Note high/low values are 1/5;
PROC UNIVARIATE;
  VAR q1-q4; RUN;

* This will use the 1-3 codings, not a good idea!;
* High/Low values are now 2/4;

PROC UNIVARIATE;
  VAR qr1-qr4;
  RUN;
GET FILE='mydata.sav'.

RECODE q1 to q4 (1=2) (5=4).

SAVE OUTFILE='myleft.sav'.
* Filename: Recode.do

use c:\myRfolder\mydata, clear
recode q1-q4 (1=2) (5=4)



Renaming Variables

 R SAS SPSS Stata
library("plyr")
# Use old="new" format.

myChanges <-
  c(q1 = "x1", q2 = "x2",
    q3 = "x3", q4 = "x4")
mydata <- rename(mydata, myChanges)

# The standard R approach is
# more complex when renaming just
# a few variables:

names(mydata) <- c("workshop", "gender", "x1", "x2", "x3", "x4")

# See books for many more examples.
DATA myLib.mydata;
  RENAME q1-q4=x1-x4;
  *or;
  RENAME q1=x1  q2=x2 
         q3=x3  q4=x4;
 GET FILE='mydata.sav'.
RENAME VARIABLES (q1=x1)(q2=x2)(q3=x3)(q4=x4).
* Or...
RENAME VARIABLES (q1 q2 q3 q4 = x1 x2 x3 x4).
* Or...
RENAME VARIABLES (q1 TO q4 = x1 TO x4).
* Filename: Rename.do

use c:\myRfolder\mydata, clear
preserve
rename q1 x1
rename q2 x2
rename q3 x3
rename q4 x4

* or
restore
forvalues i = 1/4 {
rename q`i' x`i'
}



Reshaping Variables to Observations and Back

 R SAS SPSS Stata
# Filename: Reshape.R

setwd("c:/myRfolder")
load("mydata.RData")

library("reshape2")
myChanges <- c(
  q1 = "time1",
  q2 = "time2",
  q3 = "time3",
  q4 = "time4")
mydata <- rename(mydata, myChanges)
mydata$subject <- factor(1:8)
mydata

# Reshaping from wide to long
library("reshape2")  # Just a reminder
mylong <- melt(mydata)
mylong

# Again, specifying arguments
mylong <- melt(mydata,
  id.vars      = c("subject", "workshop", "gender"),
  measure.vars = c("time1", "time2", "time3", "time4"),
  value.name   = "variable")
mylong

# Reshaping from long to wide
mywide <- dcast(mylong,
  subject + workshop + gender ~ variable)
mywide

# ---Two Time Variables---

load("mydata.RData")
mydata$subject <- factor(1:8)
library("reshape2")
myChanges <- c(
  q1 = "M1_L1",
  q2 = "M1_L2",
  q3 = "M2_L1",
  q4 = "M2_L2")
mydata <- rename(mydata, myChanges)
mydata

library("reshape2")  # Just a reminder
mylong2 <- melt(mydata)
mylong2

# Same thing with arguments specified
mylong2 <- melt(mydata,
  id.vars      = c("subject", "workshop", "gender"),
  measure.vars = c("M1_L1", "M1_L2", "M2_L1", "M2_L2"),
  value.name   = "value")
mylong2

mylong2$method <- rep( c("M1", "M2"), each=16, times=1)
mylong2$level  <- rep( c("L1", "L2"), each=8,  times=2)
mylong2
mylong2$variable <- NULL

# Reshape to wide
mywide2 <- dcast(mylong2,
  gender + workshop + subject ~ method + level)
mywide2
mywide2[ order(mywide2$subject), ]

# Aggregation via reshape

dcast(mylong2, gender + workshop ~ method + level,
  mean, na.rm = TRUE)
dcast(mylong2, gender + workshop ~ method,
  mean, na.rm = TRUE)
dcast(mylong2, workshop ~ .,
  mean, na.rm = TRUE)
* Filename: Reshape.sas ;

LIBNAME myLib 'C:\myRfolder';

* Wide to long;
PROC TRANSPOSE DATA=mylib.mydata
  OUT=myLib.mylong;
  VAR q1-q4;
  BY id workshop gender;
PROC PRINT;
RUN;
DATA mylib.mylong;
  SET mylib.mylong( rename=(COL1=value) );
  time=INPUT( SUBSTR( _NAME_, 2) , 1.);
  DROP _NAME_;
RUN;
PROC PRINT;
RUN;

* Long to wide;
PROC TRANSPOSE DATA=mylib.mylong
  OUT=myLib.mywide PREFIX=q;
  BY id workshop gender;
  ID time;
  VAR value;
RUN;
DATA mylib.mywide;
  SET mylib.mywide(DROP=_NAME_);
RUN;
PROC PRINT;
RUN;
* Filename: Reshape.sps .

CD 'C:\myRfolder'.
GET FILE='mydata.sav'.
* Wide to long.
VARSTOCASES  /MAKE Y FROM q1 q2 q3 q4
 /INDEX = Question(4)
 /KEEP =  id workshop gender
 /NULL = KEEP.
LIST.
SAVE OUTFILE='mywide.sav'.
* Long to wide.
GET FILE='mywide.sav'.
CASESTOVARS
 /ID = id workshop gender
 /INDEX = Question
 /GROUPBY = VARIABLE.
LIST.
SAVE OUTFILE='mylong.sav'.
* Filename: Reshape.do

use c:\myRfolder\mydata, clear

reshape long q, i(subject) j(item)
list

reshape wide q, i(subject) j(item)
list



Sorting Data Sets

 R SAS SPSS Stata
setwd("c:/myRfolder")
load(file = "mydata.RData")
mydata

# Show first four observations in order.
mydata[ c(1, 2, 3, 4), ]

# Show them in reverse order.
mydata[ c(4, 3, 2, 1), ]

# Create order variable for workshop.
myW <- order( mydata$workshop )
myW
mydata[ myW, ]

# Create order variable for gender then workshop.
myGW <- order( mydata$gender, mydata$workshop )
myGW
mydata[ myGW, ]

# Create order variable for
# descending (-) workshop then gender
myWdG <- order( -mydata$workshop, mydata$gender )
myWdG

# Print data in WG order.
mydata[ myWdG, ]

# Save data in WdG order.
mydataSorted <- mydata[ myWdG,  ]
mydataSorted
LIBNAME myLib '\myRfolder';

PROC SORT  DATA = myLib.mydata;
  BY workshop;
RUN;
PROC PRINT DATA = myLib.mydata;
RUN;

PROC SORT  DATA = myLib.mydata;
  BY gender workshop;
RUN;
PROC PRINT DATA = myLib.mydata;
RUN;

PROC SORT  DATA = myLib.mydata;
  BY descending gender workshop;
RUN;
PROC PRINT DATA = myLib.mydata;
RUN;
CD '\myRfolder'.
GET FILE = 'mydata.sav'.

SORT CASES BY workshop (A).
LIST.

SORT CASES BY gender (A) workshop (A).
LIST.

SORT CASES BY  gender (D) workshop (A).
LIST.
use c:\myRfolder\mydata, clear

* sort workshop ascending order
sort workshop
list

* sort gender and workshop, both ascending order
sort gender workshop
list

* sort workshop descending and gender in ascending order.
gsort - workshop + gender
list



Stacking or Concatenating or Adding Data Sets

 R SAS SPSS Stata
load(file = "mydata.RData")
mydata
attach(mydata)

# Create female data frame.
females <-
  mydata[which(gender == "f"),]

# Create male data frame.
males <-
  mydata[which(gender == "m"),]

# Bind their rows together
# with the rbind function.
both <- rbind(females, males)

# Drop q2 to see what happens.
males$q2 <- NULL
males

# See that row bind will
# not work now.
both <- rbind(females, males)

# Use plyr rbind.fill instead.
library("plyr")
both <-
  rbind.fill(females, males)

# Add a q2 variable to males.
males <- data.frame(males, q2 = NA)

# Now rbind can handle it.
both <-
  rbind(females, males)
DATA males;
  SET mydata;
  WHERE gender='m';
  RUN;

DATA females;
  SET mydata;
  WHERE gender='f';
  RUN;

DATA both;
  SET males females;
  RUN;
PROC PRINT; RUN;
GET FILE='mydata.sav'.
SELECT IF(gender = "f").
LIST.
SAVE OUTFILE='females.sav'.
EXECUTE .

GET FILE='mydata.sav'.
SELECT IF(gender = "m").
LIST.
SAVE OUTFILE='males.sav'.
EXECUTE .

GET FILE='females.sav'.
ADD FILES /FILE=*
 /FILE='males.sav'.
LIST.
EXECUTE .
use c:\myRfolder\mydata, clear
preserve

keep if gender=="m"
save c:\myRfolder\mymale
restore

keep if gender=="f"
save c:\myRfolder\myfemale
use c:\myRfolder\mymale, clear

append using c:\myRfolder\myfemale
save c:\myRfolder\both



Transformations

 R SAS SPSS Stata
load(file = "mydata.RData")
mydata

# Transformation in the middle
# of another function.
summary( log(mydata$q4) )

# Using dollar notation.
mydata$meanQ <-
  (mydata$q1 + mydata$q2 +
   mydata$q3 + mydata$q4) / 4
mydata

# Creating meanQ using transform.
mydata <- transform(mydata,
  meanQ=(q1 + q2 + q3 + q4) / 4 )
mydata






LIBNAME myLib 'C:\myRfolder';

DATA myLib.mydataTransformed;
SET  myLib.mydata;
totalq = (q1 + q2 + q3 + q4);
logtot = log10(totalq);
mean1 = (q1 + q2 + q3 + q4) / 4;
mean2 = mean(of q1-q4);

PROC PRINT; RUN;
GET FILE='mydata.sav'.

COMPUTE Totalq=q1+q2+q3+q4.
COMPUTE Logtot=lg10(totalq).
COMPUTE Mean1=(q1+q2+q3+q4)/4.
COMPUTE Mean2=MEAN(q1 TO q4).

SAVE OUTFILE='mydataTransformed.sav'.
LIST.
* Filename: Transform.do

use c:\myRfolder\mydata, clear
gen totalq = q1+q2+q3+q4
gen logtotalq = log10(totalq)
gen mean1 = totalq/4
egen mean2 = mean(q1-q4)
save c:\myRfolder\mydataT



Transformations, Conditional

 R SAS SPSS Stata
setwd("c:/myRfolder")
load(file = "mydata.RData")
mydata
attach(mydata)

mydata$q4Sagree <- ifelse(q4 == 5, 1, 0)
mydata$q4Sagree

mydata$q4Sagree <- as.numeric(q4 == 5 )
mydata$q4Sagree

mydata$q4agree <- ifelse(q4 >= 4, 1, 0)
mydata$q4agree

mydata$ws1agree <- ifelse(workshop == 1 & q4 >=4 , 1, 0)
mydata$ws1agree

mydata$score <- ifelse(gender == "f",
  (2 * q1) + q2,
  (3 * q1) + q2
)
mydata

# ---Cutting posttest---

detach(mydata)
load("mydata100.RData")
attach(mydata100)
head(mydata100)

# An inefficient approach:
postGroup <- posttest
postGroup <- ifelse(posttest< 60              , 1, postGroup)
postGroup <- ifelse(posttest>=60 & posttest<70, 2, postGroup)
postGroup <- ifelse(posttest>=70 & posttest<80, 3, postGroup)
postGroup <- ifelse(posttest>=80 & posttest<90, 4, postGroup)
postGroup <- ifelse(posttest>=90              , 5, postGroup)

table(postGroup)

# An efficient approach:
postGroup <-
ifelse(posttest <  60                 , 1,
  ifelse(posttest >= 60 &  posttest < 70, 2,
    ifelse(posttest >= 70 &  posttest < 80, 3,
      ifelse(posttest >= 80 &  posttest < 90, 4,
        ifelse(posttest >= 90                 , 5, posttest)
))))
table(postGroup)

# Logical approach:
postGroup <- 1+
  (posttest >= 60)+
  (posttest >= 70)+
  (posttest >= 80)+
  (posttest >= 90)
table(postGroup)

# ---Cutting Functions---

# Hmisc cut2 function
library("Hmisc")
postGroup <- cut2(posttest, c(60, 70, 80, 90) )
table(postGroup)

postGroup <- cut2(posttest, g = 5)
table(postGroup)

postGroup <- cut2(posttest, m = 25)
table(postGroup)
LIBNAME myLib 'C:\myRfolder';

DATA myLib.mydataTransformed;
SET myLib.mydata;
  IF q4  = 5 then x1 = 1; else x1 = 0;
  IF q4 >= 4 then x2 = 1; else x2 = 0;
  IF workshop = 1 & q4 >= 5 then  x3 = 1;
     ELSE x3 = 0;
  IF gender = "f" then scoreA = 2 * q1 + q2;
                  Else scoreA = 3 * q1 + q2;
  IF workshop = 1 and q4 >= 5
     THEN scoreB = 2 * q1 + q2;
     ELSE scoreB = 3 * q1 + q2;
 RUN;
 PROC PRINT; RUN;

DATA myLib.mydataTransformed;
SET myLib.mydata100;

IF      (posttest <  60)                 THEN postGroup = 1;
ELSE IF (posttest >= 60 & posttest < 70) THEN postGroup = 2;
ELSE IF (posttest >= 70 & posttest < 80) THEN postGroup = 3;
ELSE IF (posttest >= 80 & posttest < 90) THEN postGroup = 4;
ELSE IF (posttest >= 90)                 THEN postGroup = 5;
RUN;

PROC FREQ; TABLES postGroup; RUN;

PROC RANK OUT=myLib.mydataTransformed GROUPS=5;
  VAR posttest;
  RUN;

PROC FREQ; TABLES posttest; RUN;
CD 'C:\myRfolder'.
GET FILE = 'mydata.sav'.

DO IF (Q4 eq 5).
+ COMPUTE X1 = 1.
ELSE.
+ COMPUTE X1 = 0.
END IF.

DO IF (Q4 GE 4).
+ COMPUTE X2 = 1.
ELSE.
+ COMPUTE X2 = 0.
END IF.

DO IF (workshop EQ 1 AND q4 GE 4).
+ COMPUTE X3 = 1.
ELSE.
+ COMPUTE X3 = 0.
END IF.

DO IF (gender = 'f').
+ COMPUTE scoreA = 2 * q1 + q2.
ELSE.
+ COMPUTE scoreA = 3 * q1 + q2.
END IF.

DO IF (workshop EQ 1 AND q4 GE 5).
+ COMPUTE scoreB = 2 * q1 + q2.
ELSE.
+ COMPUTE scoreB = 3 * q1 + q2.
END IF.
EXECUTE.

GET FILE='mydata100.sav'.
DATASET NAME DataSet2 WINDOW=FRONT.

DO IF (posttest LT  60).
+ COMPUTE  postGroup = 1.
ELSE IF (posttest GE 60  AND posttest LT 70).
+ COMPUTE postGroup = 2.
ELSE IF (posttest GE 70 AND posttest LT 80).
+ COMPUTE postGroup = 3.
ELSE IF (posttest GE 80 AND posttest LT 90).
+ COMPUTE postGroup = 4.
ELSE IF (posttest GE 90).
+ COMPUTE postGroup = 5.
END IF.
EXECUTE.

DATASET ACTIVATE DataSet2.
FREQUENCIES VARIABLES=postGroup
  /ORDER=ANALYSIS.

DATASET ACTIVATE DataSet1.
RANK VARIABLES=posttest (A)
  /NTILES(5)
  /PRINT=YES
  /TIES=MEAN.

FREQUENCIES VARIABLES=Nposttes
  /ORDER=ANALYSIS.
* Filename: TransformIF.do

use c:\myRfolder\mydata, clear
preserve
if q4 == 5 {
gen x1=1
}
else {
gen x1=0
}
if q4>=4 {
gen x2=1
}
else {
gen x2=0
}
if workshop == 1 & q4>=5 {
gen x3=1
}
else {
gen x3=0
}
if gender=="f" {
gen scoreA = 2*q1+q2
}
else {
gen scoreA = 3*q1+q2
}
if workshop==1 & q4>5 {
gen scoreB = 2*q1+q2
}
else {
gen scoreB = 3*q1+q2
}
restore





Transformations, Multiple Conditional

 R SAS SPSS Stata
attach(mydata)
mydata

# Using the ifelse approach.
mydata$score1 <-
  ifelse( gender == "f",
  (2 * q1) + q2,  #1st for females.
  (20* q1) + q2   #1st for males.
)
mydata$score2 <-
ifelse( gender == "f",
  (3  * q1) + q2,  #2nd for females.
  (30 * q1) + q2   #2nd for males.
)
mydata

# Using the subscript approach.
load(file = "mydata.RData")

# Create names in data frame.
detach(mydata)
mydata <-
  data.frame(mydata,
  score1 = NA,
  score2 = NA)

attach(mydata)

# Find which are males and females.
gals <- which( gender == "f" )
guys <- which( gender == "m" )

# Now assign scores.
mydata[gals, "score1"] <- 
  2 * q1[gals] + q2[gals]
mydata[gals, "score2"] <- 
  3 * q1[gals] + q2[gals]
mydata[guys, "score1"] <-
  20 * q1[guys] + q2[guys]
mydata[guys, "score2"] <-
  30 * q1[guys] + q2[guys]
DATA myLib.mydata;
SET  myLib.mydata;

IF gender="f" THEN DO;
  score1 = (2 * q1) + q2;
  score2 = (3 * q1) + q2;
END;

ELSE IF gender="m" THEN DO;
  score1 = (20 * q1) + q2;
  score2 = (30 * q1) + q2;
END;
GET FILE = 'mydata.sav'.

DO IF (gender EQ 'm').
+  COMPUTE score1 = (2*q1) + q2.
+  COMPUTE score2 = (3*q1) + q2.
ELSE IF (gender EQ 'f').
+  COMPUTE  score1 = (20*q1) + q2.
+  COMPUTE  score2 = (30*q1) + q2.
END IF.
* Filename: TransformIF2.do

use c:\myRfolder\mydata, clear
preserve
if gender=="m" {
gen score1 = (20*q1)+q2
gen score2 = (30*q1)+q2
}
else if gender=="f" {
gen score1 = (2*q1) + q2
gen score2 = (3*q1) + q2
}
restore



Transposing or Flipping Data Sets

 R SAS SPSS Stata
# Filename: Transpose.R

setwd("c:/myRfolder")
load("mydata.RData")
mydata

myQs <- c("q1", "q2", "q3", "q4")
myQdf <- mydata[ ,myQs]
myQdf
myFlipped <- t(myQdf)
myFlipped
class(myFlipped)  # coerced into a matrix!
myFixed <- as.data.frame( t(myFlipped) )
myFixed

# Again, but with all the data
options(width = 60)
myFlipped <- t(mydata)
myFlipped

myFixed <- t(myFlipped)
myFixed
myFixed <- data.frame(myFixed)
str(myFixed)

myQs <- c("q1", "q2", "q3", "q4")
myFixed[ ,myQs] <-
  lapply(myFixed[ ,myQs], as.numeric)
str(myFixed)
* Filename: Transpose.sas ;

LIBNAME myLib 'C:\myRfolder';

PROC TRANSPOSE DATA=myLib.mydata OUT=mycopy;
PROC PRINT; RUN;

PROC TRANSPOSE DATA=mycopy OUT=myFixed;
PROC PRINT; RUN;
* Filename: Transpose.sps.

CD 'C:\myRfolder'.
GET FILE='mydata.sav'.
DATASET NAME DataSet1 WINDOW=FRONT.

FLIP VARIABLES=id workshop gender q1 q2 q3 q4.

FLIP VARIABLES=var001 var002 var003 var004
  var005 var006 var007 var008
  /NEWNAMES=CASE_LBL.




Z-scores, Calculating

 R SAS SPSS Stata
# Applying Z Transformation.

myZs <- apply(mymatrix, 2, scale)
myZs

* Get Z Scores;
PROC STANDARD DATA=mylib.mydata
  MEAN=0 STD=1 out=myZs;
RUN;
PROC PRINT;
RUN;
DESCRIPTIVES VARIABLES=q1 q2 q3 q4
  /SAVE
  /STATISTICS=MEAN STDDEV MIN MAX.