User Tools

Site Tools


mkatari-bioinformatics-august-2013-clustering

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
mkatari-bioinformatics-august-2013-clustering [2014/12/11 15:24]
mkatari [K-means]
mkatari-bioinformatics-august-2013-clustering [2015/06/17 13:26] (current)
mkatari
Line 4: Line 4:
 ====== Clustering rna-seq data ====== ====== Clustering rna-seq data ======
 continuation from [[mkatari-bioinformatics-august-2013-deseq|DESeq]] continuation from [[mkatari-bioinformatics-august-2013-deseq|DESeq]]
 +
 +[[https://drive.google.com/file/d/0B172nc4dAaaORnh3MkZqUE9PVjA/view?usp=sharing|resSig.txt]]
 +[[https://drive.google.com/file/d/0B172nc4dAaaONXFIX2YxeDRCbkE/view?usp=sharing|normalized.txt]]
 +
 +In case you didn't get DESeq to work download and load the files above
 +
 +<code>
 +resSig = read.table("resSig.txt", header=T)
 +normalized = read.table("normalized.txt", header=T, row.names=1)
 +
 +</code>
  
 Get the significant genes Get the significant genes
Line 12: Line 23:
 Get the normalized values for the significant genes Get the normalized values for the significant genes
 <code> <code>
-sigGenes.normalized = normalized[sigGenes,]+sigGenes.normalized = normalized[as.character(sigGenes),]
 </code> </code>
  
Line 70: Line 81:
  
 <code> <code>
-# this function takes an vector to be calculated.+# this function takes vector of gene expression values.
 scaleData <- function(x) { scaleData <- function(x) {
   x = as.numeric(x)   x = as.numeric(x)
Line 78: Line 89:
   return(y)   return(y)
 } }
 +</code>
  
-#we need to transpose it because apply function returns the genes as different columns.+we need to transpose it because apply function returns the genes as different columns. 
 + 
 +<code>
 scaledSigGenes = t(apply(sigGenes.normalized, 1, scaleData)) scaledSigGenes = t(apply(sigGenes.normalized, 1, scaleData))
 colnames(scaledSigGenes)=colnames(sigGenes.normalized) colnames(scaledSigGenes)=colnames(sigGenes.normalized)
 +</code>
  
-#now to run k-means, in this case we are starting with 2 cluster+now to run k-means, in this case we are starting with 2 cluster.
-#just like for heirarchical clustering, we have to first transpose the data so compare genes.+
  
-SigGenes.kmeans.2 = kmeans(t(scaledSigGenes), 2)+<code> 
 +SigGenes.kmeans.2 = kmeans(scaledSigGenes, 2, nstart=25) 
 +</code>
  
-#a plot of the groups +To obtain the measure of how well the clustering has performedwe can look at the sum of squares between members of the outside group and sum of squares totalHigher the better.
-plot(SigGenes.kmeans.2$centers[1,], SigGenes.kmeans.2$centers[2,])+
  
-# a measure of how well the clustering has performed +<code>
-# it is the sum of squares between members of the outside group and sum of squares total +
-# higher the better.+
 SigGenes.kmeans.2$betweenss/SigGenes.kmeans.2$totss SigGenes.kmeans.2$betweenss/SigGenes.kmeans.2$totss
 +</code>
  
-#to get the genes in the different clusters+In order to determine the ideal number of k, we can try many different K's and look to see how well they performed. 
 + 
 +<code> 
 +getBestK <- function(x) { 
 +  kmeans_ss=numeric() 
 +  kmeans_ss[1]=0 
 +   
 +  for (i in 2:20) { 
 +     kmeans_tmp=kmeans(x, i, nstart=25) 
 +     #alternate way of looking at proportion of ss that is provided by between groups. 
 +     #kmeans_ss[i] = kmeans_tmp$betweenss/kmeans_tmp$totss    
 + 
 +     #using silhouette width to evaluate clusters. 
 +     kmeans_sil= (kmeans_tmp$betweenss-kmeans_tmp$withinss)/max(kmeans_tmp$betweenss, kmeans_tmp$withinss)  
 +     kmeans_ss[i] = mean(kmeans_sil) 
 + 
 + 
 +  } 
 +  return(kmeans_ss) 
 +
 + 
 +kmeans_ss=getBestK(scaledSigGenes) 
 +plot(kmeans_ss) 
 + 
 +</code> 
 +To get the genes in the different clusters 
 +<code>
 SigGenes.kmeans.2.group1 = names(which(SigGenes.kmeans.2$cluster==1)) SigGenes.kmeans.2.group1 = names(which(SigGenes.kmeans.2$cluster==1))
 SigGenes.kmeans.2.group2 = names(which(SigGenes.kmeans.2$cluster==2)) SigGenes.kmeans.2.group2 = names(which(SigGenes.kmeans.2$cluster==2))
 +</code>
 +
 +
 +The code below plots k-means clustering results. You simply have to provide the k-means output and the labels.
 +
 +<code>
 +plotClusterCenters<-function(kmeansres, 
 +                             myxlab="Treatment", 
 +                             myylab="Expression",
 +                             mymain="K-means Clusters") {
 +  
 +  mycolors=c("blue","red","green","orange","pink","black")
 +  centersdim = dim(kmeansres$centers)
 +  plot(kmeansres$centers[1,], 
 +       type="b", 
 +       col=mycolors[1], 
 +       xlab=myxlab,
 +       ylab=myylab,
 +       main=mymain,
 +       ylim=c(round(min(kmeansres$centers)),
 +                               round(max(kmeansres$centers))),
 +                               xaxt="n")
 +  
 +  axis(1, at=c(1:centersdim[2]), labels=names(kmeansres$centers[1,]))
 +  
 +  for (i in 2:centersdim[1]) {
 +    lines(kmeansres$centers[i,], type="b", col=mycolors[i])
 +  }
 +  
 +}
 +
  
 +plotClusterCenters(SigGenes.kmeans.2)
 </code> </code>
  
Line 107: Line 179:
  
 <code> <code>
 +install.packages("gplots")
 library(gplots) library(gplots)
 </code> </code>
Line 126: Line 199:
 <code> <code>
 pdf("heatmap.pdf") pdf("heatmap.pdf")
-heatmap.2(sigGenes.normalized, +heatmap.2(as.matrix(sigGenes.normalized)
           col=redgreen(75),           col=redgreen(75),
           hclustfun=hclust2,           hclustfun=hclust2,
mkatari-bioinformatics-august-2013-clustering.1418311494.txt.gz · Last modified: 2014/12/11 15:24 by mkatari