Dataframe

Read a tab-separated file (data_2x20.csv) into a dataframe

# import file from disk

mydata = read.table('data_2x20.csv', header=TRUE)

# same, but explicitly defining the separator as a tab ('\t') and using the first column as row names

mydata = read.table('data_2x20.csv', header=TRUE, sep="\t", row.names=1 )

# direct download and import file

mydata = read.table( url('http://www.nlpca.org/data_2x20.csv') , header=TRUE)

Check type of imported variable 'mydata'

A dataframe is a special type of list with class 'data.frame'.

typeof(mydata)

"list"

class(mydata)

"data.frame"

Get size and view content of a dataframe dataset

head(mydata) # show the first rows (6 by default)

height width

sample01 6.576 3.644

sample02 6.379 3.110

sample03 10.542 4.213

sample04 4.543 2.954

sample05 6.092 3.248

sample06 8.804 3.907

tail(mydata) # show the last rows (6 by default)

height width

sample15 6.636 3.293

sample16 9.965 4.176

sample17 7.012 2.990

sample18 9.197 3.977

sample19 9.837 3.872

sample20 10.841 4.197

colnames(mydata) # names of columns

"height" "width"

rownames(mydata) # names of rows

"sample01" "sample02" "sample03" "sample04" "sample05" "sample06"

"sample07" "sample08" "sample09" "sample10" "sample11" "sample12"

"sample13" "sample14" "sample15" "sample16" "sample17" "sample18"

"sample19" "sample20"

dim(mydata) # size of data

20 2

str(mydata) # show format and structure

'data.frame': 20 obs. of 2 variables:

$ height: num 6.58 6.38 10.54 4.54 6.09 ...

$ width : num 3.64 3.11 4.21 2.95 3.25 ...

colSums(mydata) # get sum of each column

height width

150.955 70.257

Add column to dataframe

# add the product of columns "height" and "width"

mydata$prod = mydata$height * mydata$width

# add group labels

mydata$group = c("A","A","A","A","A","B","B","B","B","B","C","C","C","C","C","D","D","D","D","D")

# add new column "subject", initialized with all values set to NA

mydata$subject = NA

mydata

height width prod group subject

sample01 6.576 3.644 23.96294 A NA

sample02 6.379 3.110 19.83869 A NA

sample03 10.542 4.213 44.41345 A NA

sample04 4.543 2.954 13.42002 A NA

sample05 6.092 3.248 19.78682 A NA

sample06 8.804 3.907 34.39723 B NA

sample07 6.924 3.543 24.53173 B NA

°°°

Remove column from dataframe

# remove column "prod" from dataframe "mydata"

mydata$prod = NULL

height width group subject

sample01 6.576 3.644 A NA

sample02 6.379 3.110 A NA

sample03 10.542 4.213 A NA

°°°

Replace all missing values 'NA' with zero '0'

mydata[ is.na(mydata) ] = 0

height width group subject

sample01 6.576 3.644 A 0

sample02 6.379 3.110 A 0

sample03 10.542 4.213 A 0

°°°

Get column subset

# get index position of column "width"

which(names(mydata)=="width")

2

# get all columns from 1 ("height") to 2 ("width")

datasubset = mydata[,1:2]

height width

sample01 6.576 3.644

sample02 6.379 3.110

sample03 10.542 4.213

# get selected columns: 3 ("group"), 1 ("height") and 2 ("width")

datasubset = mydata[,c(3,1,2)]

group height width

sample01 A 6.576 3.644

sample02 A 6.379 3.110

sample03 A 10.542 4.213

read more: → dataframe subset