Data Frame

Data Structures

Read a tab-separated .csv file into a data.frame

# read .csv data file

myData = read.table('data_2x20.csv', header=TRUE)

# same, but explicitly setting the separator to tab ('\t') and taking the row names from the first column (row.names=1)

myData = read.table('data_2x20.csv', header=TRUE, sep="\t", row.names=1 )

# download and read .csv data from a web address 

myData = read.table( url('http://www.nlpca.org/data_2x20.csv') , header=TRUE)

→ Files: read & write

Check type of imported variable 'myData'

A data frame is a special kind of list, with class 'data.frame'.

typeof(myData)

  "list"


class(myData)

  "data.frame"

Get the size and view the content of a data frame

head(myData) # show top lines

         height width

sample01  6.576 3.644

sample02  6.379 3.110

sample03 10.542 4.213

sample04  4.543 2.954

sample05  6.092 3.248

sample06  8.804 3.907

tail(myData) # show last lines

         height width

sample15  6.636 3.293

sample16  9.965 4.176

sample17  7.012 2.990

sample18  9.197 3.977

sample19  9.837 3.872

sample20 10.841 4.197

colnames(myData) # names of columns

   "height" "width"

rownames(myData) # names of rows

  "sample01" "sample02" "sample03" "sample04" "sample05" "sample06"

  "sample07" "sample08" "sample09" "sample10" "sample11" "sample12"

  "sample13" "sample14" "sample15" "sample16" "sample17" "sample18"

  "sample19" "sample20"

dim(myData) # size of data

   20  2

str(myData) # show format and structure

'data.frame':    20 obs. of  2 variables:

 $ height: num  6.58 6.38 10.54 4.54 6.09 ...

 $ width : num  3.64 3.11 4.21 2.95 3.25 ...

colSums(myData) # get sum of each column

 height   width

 150.955  70.257

Add column to dataframe

# add product of columns "height" and "width"

myData$prod = myData$height * myData$width

# add group labels (two equivalent ways: typed out manually, or using rep() to replicate/repeat values)

myData$group = c("A","A","A","A","A","B","B","B","B","B","C","C","C","C","C","D","D","D","D","D")

myData$group = c(rep("A", 5),rep("B", 5),rep("C", 5),rep("D", 5))

# add new column "subject", initialize by having all values "NA"

myData$subject = NA

myData

         height width     prod group subject

sample01  6.576 3.644 23.96294     A      NA

sample02  6.379 3.110 19.83869     A      NA

sample03 10.542 4.213 44.41345     A      NA

sample04  4.543 2.954 13.42002     A      NA

sample05  6.092 3.248 19.78682     A      NA

sample06  8.804 3.907 34.39723     B      NA

sample07  6.924 3.543 24.53173     B      NA

°°°

Remove column from dataframe

# remove column "prod" from dataframe "myData"

myData$prod = NULL

         height width group subject

sample01  6.576 3.644     A      NA

sample02  6.379 3.110     A      NA

sample03 10.542 4.213     A      NA

°°°

Replace all missing values 'NA' with zero '0'

myData[ is.na(myData) ] = 0     

         height width group subject

sample01  6.576 3.644     A       0

sample02  6.379 3.110     A       0

sample03 10.542 4.213     A       0

°°°

Get column subset

# get index position of column "width"

which(names(myData)=="width")

  2

# get all columns from 1 ("height") to 2 ("width")

datasubset = myData[,1:2]

         height width

sample01  6.576 3.644

sample02  6.379 3.110

sample03 10.542 4.213

# get selected columns:  3 ("group"),  1 ("height")  and  2 ("width")

datasubset = myData[,c(3,1,2)]

         group height width

sample01     A  6.576 3.644

sample02     A  6.379 3.110

sample03     A 10.542 4.213

→ data.frame subset

Convert data.frame to matrix

# read demo .csv data file into a data.frame

myData = read.table( url('http://www.nlpca.org/data_2x20.csv') , header=TRUE)

# converting data.frame to matrix, using   as.matrix()

myMatrix = as.matrix(myData)


# check type and class of variable "myData"

class(myData)

  "data.frame"

typeof(myData)

  "list"

# check type and class of converted variable "myMatrix"

class(myMatrix)

  "matrix" "array"    (R versions before 4.0.0 print only "matrix")

typeof(myMatrix)

  "double"

!!! Make sure that your data.frame contains only numeric columns. If a data.frame contains any text-based column, such as the "group" column, as.matrix() returns a pure character matrix in which all numbers are stored as text elements (type: "character" instead of "double").


For text-number mixed data.frames, 

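use    matrix()   instead of    as.matrix() to keep number-columns numerical, and text-columns as text-character strings; note that the result is a list-matrix with one cell per column (each cell holds a whole column vector), not a rows x columns matrix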

myMatrix = matrix(myData)

use    data.matrix()   instead of    as.matrix() to get a pure numerical matrix; character columns are first converted to factors and then to their integer codes (they are not converted to NA)

myMatrix = data.matrix(myData)