mkatari-bioinformatics-august-2013-clustering
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revisionNext revisionBoth sides next revision | ||
mkatari-bioinformatics-august-2013-clustering [2014/12/11 15:24] – [K-means] mkatari | mkatari-bioinformatics-august-2013-clustering [2015/06/17 12:48] – mkatari | ||
---|---|---|---|
Line 4: | Line 4: | ||
====== Clustering rna-seq data ====== | ====== Clustering rna-seq data ====== | ||
continuation from [[mkatari-bioinformatics-august-2013-deseq|DESeq]] | continuation from [[mkatari-bioinformatics-august-2013-deseq|DESeq]] | ||
+ | |||
+ | [[https:// | ||
+ | [[https:// | ||
+ | |||
+ | In case you didn't get DESeq to work, download and load the files above. | ||
+ | |||
+ | < | ||
+ | resSig = read.table(" | ||
+ | normalized = read.table(" | ||
+ | |||
+ | </ | ||
Get the significant genes | Get the significant genes | ||
Line 12: | Line 23: | ||
Get the normalized values for the significant genes | Get the normalized values for the significant genes | ||
< | < | ||
- | sigGenes.normalized = normalized[sigGenes, | + | sigGenes.normalized = normalized[as.character(sigGenes),] |
</ | </ | ||
Line 70: | Line 81: | ||
< | < | ||
- | # this function takes an vector | + | # this function takes a vector |
scaleData <- function(x) { | scaleData <- function(x) { | ||
x = as.numeric(x) | x = as.numeric(x) | ||
Line 78: | Line 89: | ||
return(y) | return(y) | ||
} | } | ||
+ | </ | ||
- | #we need to transpose it because apply function returns the genes as different columns. | + We need to transpose the result because the apply function returns the genes as different columns. |
+ | |||
+ | < | ||
scaledSigGenes = t(apply(sigGenes.normalized, | scaledSigGenes = t(apply(sigGenes.normalized, | ||
colnames(scaledSigGenes)=colnames(sigGenes.normalized) | colnames(scaledSigGenes)=colnames(sigGenes.normalized) | ||
+ | </ | ||
- | #now to run k-means, in this case we are starting with 2 cluster. | + Now we run k-means; in this case we are starting with 2 clusters. |
- | #just like for heirarchical clustering, we have to first transpose the data so compare genes. | + | |
- | SigGenes.kmeans.2 = kmeans(t(scaledSigGenes), 2) | + | < |
+ | SigGenes.kmeans.2 = kmeans(scaledSigGenes, | ||
+ | </ | ||
- | #a plot of the groups | + | To obtain the measure |
- | plot(SigGenes.kmeans.2$centers[1,], SigGenes.kmeans.2$centers[2, | + | |
- | # a measure of how well the clustering has performed | + | < |
- | # it is the sum of squares between members of the outside group and sum of squares total | + | |
- | # higher the better. | + | |
SigGenes.kmeans.2$betweenss/ | SigGenes.kmeans.2$betweenss/ | ||
+ | </ | ||
- | #to get the genes in the different clusters | + In order to determine the ideal value of k, we can try many different values of k and see how well each one performs. |
+ | |||
+ | < | ||
+ | getBestK <- function(x) { | ||
+ | kmeans_ss=numeric() | ||
+ | kmeans_ss[1]=0 | ||
+ | |||
+ | for (i in 2:20) { | ||
+ | | ||
+ | #alternate way of looking at proportion of ss that is provided by between groups. | ||
+ | # | ||
+ | |||
+ | # | ||
+ | | ||
+ | | ||
+ | |||
+ | |||
+ | } | ||
+ | return(kmeans_ss) | ||
+ | } | ||
+ | |||
+ | kmeans_ss=getBestK(scaledSigGenes) | ||
+ | plot(kmeans_ss) | ||
+ | |||
+ | </ | ||
+ | To get the genes in the different clusters | ||
+ | < | ||
SigGenes.kmeans.2.group1 = names(which(SigGenes.kmeans.2$cluster==1)) | SigGenes.kmeans.2.group1 = names(which(SigGenes.kmeans.2$cluster==1)) | ||
SigGenes.kmeans.2.group2 = names(which(SigGenes.kmeans.2$cluster==2)) | SigGenes.kmeans.2.group2 = names(which(SigGenes.kmeans.2$cluster==2)) | ||
+ | </ | ||
+ | |||
+ | |||
+ | The code below plots k-means clustering results. You simply have to provide the k-means output and the labels. | ||
+ | |||
+ | < | ||
+ | plotClusterCenters< | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | mycolors=c(" | ||
+ | centersdim = dim(kmeansres$centers) | ||
+ | plot(kmeansres$centers[1, | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | axis(1, at=c(1: | ||
+ | | ||
+ | for (i in 2: | ||
+ | lines(kmeansres$centers[i, | ||
+ | } | ||
+ | | ||
+ | } | ||
+ | |||
+ | plotClusterCenters(SigGenes.kmeans.2) | ||
</ | </ | ||
mkatari-bioinformatics-august-2013-clustering.txt · Last modified: 2015/06/17 13:26 by mkatari