Data structures‎ > ‎

Dataframe



Read a tab-separated file (table.csv) into a dataframe


# import file from disk
mydata = read.table('data_2x20.csv', header=TRUE)

# same, but explicitly setting the separator to tab ('\t') and taking the row names from the first column
mydata = read.table('data_2x20.csv', header=TRUE, sep="\t", row.names=1 )

# direct download and import file
mydata = read.table( url('http://www.nlpca.org/data_2x20.csv') , header=TRUE)

Check type of imported variable 'mydata'

A dataframe is a special type of list, with class 'data.frame'.

typeof(mydata)
  "list"

class(mydata)
  "data.frame"

Get size and view content of a dataframe dataset

head(mydata)  # show the first rows (6 by default)
         height width
sample01  6.576 3.644
sample02  6.379 3.110
sample03 10.542 4.213
sample04  4.543 2.954
sample05  6.092 3.248
sample06  8.804 3.907


tail(mydata) # show the last rows (6 by default)
         height width
sample15  6.636 3.293
sample16  9.965 4.176
sample17  7.012 2.990
sample18  9.197 3.977
sample19  9.837 3.872
sample20 10.841 4.197


colnames(mydata) # names of columns
   "height" "width"

rownames(mydata) # names of rows
  "sample01" "sample02" "sample03" "sample04" "sample05" "sample06"
  "sample07" "sample08" "sample09" "sample10" "sample11" "sample12"
  "sample13" "sample14" "sample15" "sample16" "sample17" "sample18"
  "sample19" "sample20"



dim(mydata) # size of data
   20  2

str(mydata) # show format and structure
'data.frame':    20 obs. of  2 variables:
 $ height: num  6.58 6.38 10.54 4.54 6.09 ...
 $ width : num  3.64 3.11 4.21 2.95 3.25 ...

colSums(mydata) # get sum of each column
 height   width
 150.955  70.257



Add column to dataframe

# add product of columns "height" and "width"
mydata$prod = mydata$height * mydata$width

# add group labels
mydata$group = c("A","A","A","A","A","B","B","B","B","B","C","C","C","C","C","D","D","D","D","D")

# add new column "subject", initialized with all values set to NA
mydata$subject = NA

mydata
         height width     prod group subject
sample01  6.576 3.644 23.96294     A      NA
sample02  6.379 3.110 19.83869     A      NA
sample03 10.542 4.213 44.41345     A      NA
sample04  4.543 2.954 13.42002     A      NA
sample05  6.092 3.248 19.78682     A      NA
sample06  8.804 3.907 34.39723     B      NA
sample07  6.924 3.543 24.53173     B      NA

°°°

Remove column from dataframe

# remove column "prod" from dataframe "mydata"
mydata$prod = NULL
         height width group subject
sample01  6.576 3.644     A      NA
sample02  6.379 3.110     A      NA
sample03 10.542 4.213     A      NA
°°°

Replace all missing values 'NA' with zero '0'

mydata[ is.na(mydata) ] = 0     
         height width group subject
sample01  6.576 3.644     A       0
sample02  6.379 3.110     A       0
sample03 10.542 4.213     A       0
°°°

Get column subset

# get index position of column "width"
which(names(mydata)=="width")
  2

# get all columns from 1 ("height") to 2 ("width")
datasubset = mydata[,1:2]
         height width
sample01  6.576 3.644
sample02  6.379 3.110
sample03 10.542 4.213

# get selected columns:  3 ("group"),  1 ("height")  and  2 ("width")
datasubset = mydata[,c(3,1,2)]
         group height width
sample01     A  6.576 3.644
sample02     A  6.379 3.110
sample03     A 10.542 4.213







Subpages (1): Subset