### source codes of manuscript "Combinatorial Network of Transcriptional Regulation and microRNA Regulation in Human Cancer" ### Author: Hui Yu (yuhui@scbit.org) ### We would like to thank Prof. Dennis D. Boos for publicizing the lars-based LASSO wrapper function lasso.dr1(). (http://www4.stat.ncsu.edu/~boos/var.select/lasso.wrapper.html) ###miRNAII.regression(miRNA.exp,gene.exp,tf2gene,tf2mir,mir2gene,p.value=0.05,step.do=F,prefilter=F,penalty=c('AIC','BIC')[1]) ): using stepwise linear regression or LASSO regression to sift forward-predicted three types of regulations according to parallel expression datasets. step.do=T/F: STEP/LASSO; prefilter=T/F: implement univarate regression prefiltration or not. ###vertex.edge.feature(reg2tar): calculates the vertex features (in degree, out degree, betweenness) and edge feature (betweenness). ###coregulating.pairs(reg2tar,p.cutoff=0.01): identifies the coordinating regulator pairs with significantly high portion of overlapped targets. p.cutoff: threshold of fisher's exact test p-value. ###motif.p(node2node2sign2type,n): identifies the significantly recurrent network motifs. n: times of network shufflings. miRNAII.regression <- function(miRNA.exp,gene.exp,tf2gene,tf2mir,mir2gene,p.value=0.05,step.do=T,prefilter=T,penalty=c('AIC','BIC')[1]) { # F:\K\webserver\miR-II Rs\tf.mir.regression.RData as working image. # miRNA.exp: 195 * 60 data matrix, rownames ( "hsa-let-7a" "hsa-let-7b" "hsa-let-7c" "hsa-let-7d" "hsa-let-7e" ...); colname order of miRNA.exp and gene.exp SHOULD BE identical. # gene.exp: 8388 * 59 data matrix, rownames ("15E1.2" "2'-PDE" "76P" "A2BP1" "AACS" ...); colname order of miRNA.exp and gene.exp SHOULD BE identical. # tf2gene, tf2mir, mir2gene: two-column data matrices. # step.do: allowing user to decide to use 'stepwise' linear regression or not. # output: #########tf2mir.out: (inferred in Module I) four columns: tf, mir, coef, pval #########reg2gene.out: inferred in Module II, including tf2gene.out and mir2gene.out conceptually. # tf2mir <- unique(tf2mir) mir2gene <- unique(mir2gene) tf2gene <- unique(tf2gene) # Module I: inferring regulations towards mirs. mirs.as.target <- intersect(rownames(miRNA.exp),unique(tf2mir[,2])) #tfs.as.regulator <- intersect(rownames(gene.exp),unique(tf2mir[,1])) tf2mir.out <- NULL # to be expanded through rbinding data blocks. cat('now MODULE I: inferring tf->mirs...\n') cat.i = 0 questionable = character(0) for (i in 1:length(mirs.as.target)) { if ( (i*100/length(mirs.as.target))%/%10>cat.i ) { cat.i = cat.i+1 cat(cat.i*10,'%', '\n') } mir.i <- mirs.as.target[i] cat(mir.i,': \n') tfs.to.i <- intersect(tf2mir[tf2mir[,2]==mir.i,1],rownames(gene.exp)) if (length(tfs.to.i)>=1) { if (length(tfs.to.i)==1) { d <- data.frame(mir<-miRNA.exp[mir.i,],gene.exp[tfs.to.i,]) # I'm not sure whether I should use t(gene.exp[tfs.to.i,]) colnames(d)<-c('mir',tfs.to.i) } else { d <- data.frame(mir<-miRNA.exp[mir.i,],t(gene.exp[tfs.to.i,])) # t(miRNA.exp[mir.i,]) is questionable. If so, the two branches can be combined. colnames(d)<-c('mir',tfs.to.i) } if (prefilter) { ix <- c(1) c<-NULL for (j in 2:ncol(d)) { #cat('\t',j,' of ',ncol(d),' ') d.j<-d[,c(1,j)] #cat('extracting univariable step\n') m.j<-glm(mir~.,data=d.j) #cat('passed univariable step\n') if (summary(m.j)$coef[2,4]1) { #m<-glm(mir~.,data=d[,ix]) #cat('multiple variable regression finished.\n') if (step.do) { m<-glm(mir~.,data=d[,ix]) if (penalty=='AIC') { k<-2 } else { k<-log(length(ncol(miRNA.exp))) # orginally from "k<-log(length(logfc))" in miR-I code. } sink('/dev/null') m<-step(m,k=k,silent=T) # silent unuseful? sink() c<-summary(m)$coef tfs.to.i <- setdiff(rownames(c),c('(Intercept)')) coef.to.i <- c[tfs.to.i,1] pval.to.i <- c[tfs.to.i,4] tf2mir.out <- rbind(tf2mir.out,data.frame(regulator=tfs.to.i,coef=coef.to.i,pval=pval.to.i,target=mir.i)) } else { #cat('las returned.\n') #sink('/dev/null/') #cat(colnames(d[-1]),'\n') if (length(ix)==2) { noise = rnorm(nrow(d),nrow(d),1) d <- data.frame(d,noise) ix = seq(3) questionable = c(questionable,mir.i) #object = lasso.dr1(d[,ix[-1]],d[,1]) } object = lasso.dr1(d[,ix[-1]],d[,1]) #sink() # cat('las returned.\n') #save.image('debugging.RData') if (length(object$variables)>0) { tfs.to.i <- object$variables #cat(tfs.to.i,'\n') coef.to.i <- object$coeff error.to.i <- object$mse target <- mir.i tf2mir.out <- rbind(tf2mir.out,data.frame(regulator=tfs.to.i,coef=coef.to.i,error=error.to.i,target=mir.i)) } } #cat('step finished.\n') } } } # inferring regulations towards genes. genes.as.target <- intersect(rownames(gene.exp),union(unique(tf2gene[,2]),unique(mir2gene[,2]))) cat('now MODULE II: inferring regulators->genes...'); cat('\n') reg2gene.out<-NULL cat.i = 0 for (i in 1:length(genes.as.target)) { gene.i <- genes.as.target[i] tfs.to.i <- intersect(tf2gene[tf2gene[,2]==gene.i,1],rownames(gene.exp)) mirs.to.i <- intersect(mir2gene[mir2gene[,2]==gene.i,1],rownames(miRNA.exp)) #cat(i*100/length(genes.as.target),'% ',gene.i,' ',length(tfs.to.i)+length(mirs.to.i),'candidates: \n') if (length(tfs.to.i)+length(mirs.to.i)>=1) { if (length(tfs.to.i)+length(mirs.to.i)==1) { if (length(tfs.to.i)==1) { d <- d <- data.frame(gene<-gene.exp[gene.i,],gene.exp[tfs.to.i,]) } else { d <- d <- data.frame(gene<-gene.exp[gene.i,],miRNA.exp[mirs.to.i,]) } } else { d <- data.frame(gene<-gene.exp[gene.i,],t(rbind(gene.exp[tfs.to.i,],miRNA.exp[mirs.to.i,]))) } colnames(d)<-c('gene',tfs.to.i,mirs.to.i) ix <- c(1) if ( (i*100/length(genes.as.target))%/%5>cat.i ) { cat.i = cat.i+1 cat(cat.i*5,'%', '\n') } #cat(i*100/length(genes.as.target),'% ') #cat(i*100/length(genes.as.target),'% \n') #,gene.i,' ',length(tfs.to.i)+length(mirs.to.i),'candidates:\n ') #cat(gene.i,': ') if (prefilter) { for (j in 2:ncol(d)) { d.j<-d[,c(1,j)] m.j<-glm(gene~.,data=d.j) if (summary(m.j)$coef[2,4]1) { m<-glm(gene~.,data=d[,ix]) if (step.do) { m<-glm(gene~.,data=d[,ix])^M if (penalty=='AIC') { k<-2 } else { k<-log(length(ncol(miRNA.exp))) # orginally from "k<-log(length(logfc))" in miR-I code. } sink('/dev/null') m<-step(m,k=k,silent=T) sink() c<-summary(m)$coef^M reg.to.i <- setdiff(rownames(c),c('(Intercept)'))^M coef.to.i <- c[reg.to.i,1]^M pval.to.i <- c[reg.to.i,4]^M reg2gene.out <- rbind(reg2gene.out,data.frame(regulator=as.character(reg.to.i),coef=coef.to.i,pval=pval.to.i,target=as.character(gene.i)))^M } else { if (length(ix)==2) { noise = rnorm(nrow(d),nrow(d),1) d <- data.frame(d,noise) ix = seq(3) questionable = c(questionable,gene.i) #object = lasso.dr1(d[,ix[-1]],d[,1]) } object = lasso.dr1(d[,ix[-1]],d[,1]) if (length(object$variables)>0) { reg.to.i <- object$variables coef.to.i <- object$coeff error.to.i <- object$mse target <- mir.i if (sum(grepl('hsa-',reg.to.i,fixed=T))>0) { mir.pos <- grepl('hsa-',reg.to.i,fixed=T) if (sum(coef.to.i[mir.pos]>0) ==0 ) reg2gene.out <- rbind(reg2gene.out,data.frame(regulator=as.character(reg.to.i),coef=coef.to.i,error=error.to.i,target=as.character(gene.i))) } else reg2gene.out <- rbind(reg2gene.out,data.frame(regulator=as.character(reg.to.i),coef=coef.to.i,error=error.to.i,target=as.character(gene.i))) } } } } } rownames(reg2gene.out)<-NULL #cat('out of two modules\n') #reg2tar.out <- rbind(tf2mir.out,reg2gene.out) #colnames(reg2tar.out)<- c('regulator','coefficient','p.value','target') write.table(rbind(tf2mir.out,reg2gene.out),'reg2tar.filtered.txt',sep='\t',quote=F,row.names=F) cat('regression finished!\n') reg2tar.res <- list(tf2mir=tf2mir.out,reg2gene=reg2gene.out) matrix.reg2tar.res <- function(reg2tar.res) { type.tf2mir = rep('tf2mir',nrow(reg2tar.res$tf2mir)) types = rep('tf2gene',nrow(reg2tar.res$reg2gene)) attach(reg2tar.res$reg2gene) types[grepl('hsa-',regulator,fixed=T)] <- 'mir2gene' #need an exact parameter detach(reg2tar.res$reg2gene) type = c(type.tf2mir,types) reg2tar.out <- rbind(reg2tar.res$tf2mir,reg2tar.res$reg2gene) reg2tar.out <- data.frame(reg2tar.out,type=type) reg2tar.out = reg2tar.out[,c('type','regulator','target','coef','error')] } write.table(questionable,'questionable.txt',quote=F) reg2tar.out <- matrix.reg2tar.res(reg2tar.res) } ###################################################################################################################### ################################vertex.edge.feature (with powerlaw)################################################### ###################################################################################################################### vertex.edge.feature <- function(reg2tar) { # input 'reg2tar': two columns of regulators and targets (three types tf2mir,tf2gene, and mir2gene). # outputs a list of two components: # v.properties: vertices, in.degree, out.degree, betweenness # e.properties: e.v1,e.v2,betweenness. if (require(igraph)) { #reg2tar <- rbind(reg2tar$tf2mir,reg2tar$reg2gene) #reg2tar <- reg2tar[,c(1,4)] #g<-graph.empty() #g<-add.edges(g,reg2tar) g <- make.graph(t(reg2tar)) vertices = V(g)$name v.in.degree = degree(g,mode='in') v.out.degree = degree(g,mode='out') #power.law(v.out.degree,'power law.tff') v.betweenness = betweenness(g) e.betweenness = edge.betweenness(g) v.properties = data.frame(vertices=vertices,in.degree=v.in.degree,out.degree=v.out.degree,betweenness=v.betweenness) edges = get.edgelist(g) type = rep('tf2gene',nrow(edges)) type[grepl('hsa-',edges[,1],fixed=T)] <- 'mir2gene' type[grepl('hsa-',edges[,2],fixed=T)] <- 'tf2mir' #need an exact parameter e.properties = data.frame(type=type,e.v1=edges[,1],e.v2=edges[,2],betweenness=e.betweenness) #write.table(v.properties,'v.properties.txt',row.names=F,sep='\t',quote=F) #write.table(e.properties,'e.properties.txt',row.names=F,sep='\t',quote=F) propoerty = list(vertex=v.properties,edge=e.properties) } } power.law <- function(k,fig='power law.tiff') { logk=log2(k);logk[-logk==Inf]=0 breaks = min(logk)+0:20*((max(logk)-min(logk))/20) hst = hist(logk,breaks=breaks) logk=(breaks[1:20]+breaks[2:21])/2 logpk=log2(hst$counts); logpk[-logpk==Inf]=0 m=glm(y~x,data=data.frame(y=logpk,x=logk)) equation = paste('y = ',substr(coefficients(m)['(Intercept)'],1,5),' + ',substr(coefficients(m)['x'],1,4),'x',sep='') corr = cor(x=logk,y=logpk,method='pearson') main.txt=paste(equation,' r=',substr(corr,1,5),' 20 points') #y=predict(m,newdata=data.frame(logk)) #cat(logpk) tiff(fig,width=1028,height=768) plot(logk,logpk,xlim=c(min(logk)-1,max(logk)+1),ylim=c(min(logpk)-1,max(logpk)+1),pch=16,xlab='logrithm of degree at base 2',ylab='logrithm of number of nodes',main=main.txt) #lines(logk,fitted(y)) dev.off() } #make.graph <- function (edges) { # nodes<-unique(as.character(edges)) # nodes.i<-1:length(nodes)-1; names(nodes.i)<-nodes # edges.i<-nodes.i[edges] # g<-graph(edges.i,directed=T) # V(g)$name<-nodes # g #}; ###################################################################################################################### ################################coregulating.pairs#################################################################### ###################################################################################################################### coregulating.pairs <- function(reg2tar,p.cutoff=0.01) { #input: reg2tar: two columns of regulators and targets. #output: a matrix of information on coregulating pairs. # check if self-regulations exist. if yes, run out with a warning. g <- make.graph(t(reg2tar)) colnames(reg2tar)=c('reg','tar') node.deg = data.frame(node=V(g)$name,deg=degree(g, mode='out')) reg1.reg2.tar = merge(reg2tar,reg2tar,by.x='tar',by.y='tar') reg1.reg2.tar = reg1.reg2.tar[,c(2,3,1)] colnames(reg1.reg2.tar) = c('reg1','reg2','coTar') reg1.reg2.tar=reg1.reg2.tar[as.character(reg1.reg2.tar[,1])!=as.character(reg1.reg2.tar[,2]),] # after this step, each binary tuple is still repeated with reversed order. reg1.reg2.coTarCounts = as.data.frame(table(reg1.reg2.tar[1:2])) reg1.reg2.coTarCounts = reg1.reg2.coTarCounts[reg1.reg2.coTarCounts[,3]!=0,] colnames(reg1.reg2.coTarCounts)= c('reg1','reg2','coTarCounts') count.table.deg1 = merge(reg1.reg2.coTarCounts,node.deg,by.x='reg1',by.y='node') count.table.deg12 = merge(count.table.deg1,node.deg,by.x='reg2',by.y='node') count.table.deg12 = count.table.deg12[,c(2,1,3,4,5)] colnames(count.table.deg12) = c('reg1','reg2','coCount','count1','count2') reg2tar.logic = data.frame(reg2tar,edge=1) count.table.deg12 = merge(count.table.deg12,reg2tar.logic,by.y=c('reg','tar'),by.x=c('reg1','reg2'),all.x=T) colnames(count.table.deg12)[6] = 'reg1.to.reg2' count.table.deg12 = merge(count.table.deg12,reg2tar.logic,by.y=c('reg','tar'),by.x=c('reg2','reg1'),all.x=T) colnames(count.table.deg12)[7] = 'reg2.to.reg1' #colnames(count.table.deg12)[3:4] = c('reg1reg2','reg2reg1') count.table.deg12 = count.table.deg12[,c(2,1,3:7)] count.table.deg12[is.na(count.table.deg12)] = 0 a = count.table.deg12$coCount b = count.table.deg12$count1-a-count.table.deg12$reg1.to.reg2 c = count.table.deg12$count2-a-count.table.deg12$reg2.to.reg1 d = vcount(g)-2-(a+b+c) rowwise.fisher = function(x) { fisher.test(matrix(x,nc=2,byrow=T),alternative='g')$p.value } # a coregulating pair where A->B or A<-B needs to be specially considered. fisher.pval = apply(data.frame(a,b,c,d),1,rowwise.fisher) m = data.frame(count.table.deg12,fisher.pval=fisher.pval) m = m[m$fisher.pval<=p.cutoff,] type = rep('tf.tf',times=nrow(m)) type[grepl('hsa-',m$reg1,fixed=T) | grepl('hsa-',m$reg2,fixed=T)]='tf.mir' type[grepl('hsa-',m$reg1,fixed=T) & grepl('hsa-',m$reg2,fixed=T)]='mir.mir' out = data.frame(type=type,m) ix = order(out$fisher.pval,out$type,out$coCount,out$count1+out$count2) out = out[ix,] ix.half = seq(from=2,by=2,to=nrow(out)) out = out[ix.half,] ix = order(out$type,out$fisher.pval) out = out[ix,] } make.graph <- function (edges) { if (require(igraph)) { nodes<-unique(as.character(edges)) nodes.i<-1:length(nodes)-1; names(nodes.i)<-nodes edges.i<-nodes.i[edges] g<-graph(edges.i,directed=T) V(g)$name<-nodes g } }; motif.p <- function(node2node2sign2type,n) { # input: node2node2sign2type,n # output: a matrix with three columns: motif names, counts, and p. p <- rep(1,18) count.s <- matrix(0,nr=n,nc=18) names(p) <- colnames(count.s) <- c(paste('FBL',c('++','+-','-+','--'),sep='.'),paste('FFL.MB',c('+','-'),sep='.'),paste('FFL.MM',c('++','+-','-+','--'),sep='.'),paste('FFL.ME',c('+++','++-','-++','-+-','--+','---','+-+','+--'),sep='.')) #cat(names(p),'\n') for (i in 1:n) { cat('network shuffling ',i*100/n,'%\n') g.s <- combinat.network.random(node2node2sign2type) e.s <- get.edgelist(g.s) sign <- E(g.s)$sign node2node2sign.s <- data.frame(e.s,sign) res <- counting.motifs(node2node2sign.s) #cat(res$count,'\n') count.s[i,] <- res$count } res.motif <- counting.motifs(node2node2sign2type[,1:3]) count <- res.motif$count for (i in 1:18) { p[i] <- sum(count.s[,i]>=count[i])/n } data.frame(motif=names(p),count=count,p=p) } combinat.network.random <- function(node2node2sign2type) { node2node2sign = node2node2sign2type[,1:3] type = as.character(node2node2sign2type[,4]) type.unique = unique(type) e.s = matrix(nr=0,nc=2); sign = character(0) for (i in 1:length(type.unique)) { node2node2sign.i = node2node2sign[type==type.unique[i],] #cat('before a type.\n') g.s.i = type.network.random(node2node2sign.i) #cat('after a type.\n') e.s.i = get.edgelist(g.s.i) sign.i = E(g.s.i)$sign e.s = rbind(e.s,e.s.i) sign = c(sign,sign.i) } #cat('e.s have seemingly been generated.\n') g.s = graph.data.frame(e.s,directed=T) E(g.s)$sign = sign E(g.s)$type = type g.s } type.network.random <- function(node2node2sign) { if (require(igraph)) { g <- graph.data.frame(node2node2sign[,1:2], directed=T) E(g)$sign = node2node2sign[,3] d <- sapply(c('out','in'), function(mode) {degree(graph=g, mode=mode)}) #set.seed(1) # Is this sentence necessary? Why is it necessary? g.s <- simplify(degree.sequence.game(out.deg=d[,'out'], in.deg=d[,'in'])) V(g.s)$name = V(g)$name d.s <- sapply(c('out','in'), function(mode) {degree(graph=g.s, mode=mode)}) d.delta <- d - d.s e.s = get.edgelist(g.s) #cat('before the iteration\n') rep = 0 while ( sum(d.delta[,1]!=0)>1 | sum(d.delta[,2]!=0)>1 ) { rep = rep+1 #cat('enter repetitive network shuffling times:',rep,'\n') # ix <- which(d.delta[,1] != 0 | d.delta[,2] != 0) g.s.2 <- simplify(degree.sequence.game(out.deg=d.delta[,'out'], in.deg=d.delta[,'in'])) V(g.s.2)$name = V(g)$name d.s.2 <- sapply(c('out','in'), function(mode) {degree(graph=g.s.2, mode=mode)}) d.s = d.s + d.s.2 d.delta = d - d.s e.s.2 = get.edgelist(g.s.2) e.s = rbind(e.s,e.s.2) #cat('in.delta ',sum(d.delta[,1]!=0),' out.delta ',sum(d.delta[,2]!=0),'\n') #g.s = graph(t(e.s,directed=T)) #d.s <- sapply(c('out','in'), function(mode) {degree(graph=g.s, mode=mode)}) } #cat('after the iteration\n') g.s = graph.data.frame(e.s,directed=T) V(g.s)$name = V(g)$name E(g.s)$sign = E(g)$sign g.s } else { cat('Sorry, you must install igraph first.\n') } } # suppose 'sign' is indicated by {1,-1} # input: node2node2sign: three columns: node1,node2,sign. # output: list(cout, instance) ####### count: a vector of 18 elements, with names specifying all possible 18 three-vertexed motifs. ####### instance: all instances of each subtype motif. counting.motifs <- function(node2node2sign) { # output: motif count and motif instances, two componenets of a list. colnames(node2node2sign) = c('reg','tar','sign') gene2gene2sign = node2node2sign[!grepl('hsa-',node2node2sign$reg,fixed=T) & !grepl('hsa-',node2node2sign$tar,fixed=T),] mir2gene2sign = node2node2sign[grepl('hsa-',node2node2sign$reg,fixed=T) & !grepl('hsa-',node2node2sign$tar,fixed=T),] gene2mir2sign = node2node2sign[!grepl('hsa-',node2node2sign$reg,fixed=T) & grepl('hsa-',node2node2sign$tar,fixed=T),] # identify the four FBL motifs. triple.MM = merge(gene2mir2sign,mir2gene2sign,by.x='tar',by.y='reg') colnames(triple.MM) = c('mir','reg.of.mir','sign.reg2mir','tar.of.mir','sign.mir2tar') triple.MM = triple.MM[as.character(triple.MM$reg.of.mir) != as.character(triple.MM$tar.of.mir),] FBL = merge(triple.MM,gene2gene2sign,by.x=c('reg.of.mir','tar.of.mir'),by.y=c('tar','reg')) #FBL = triple.MM[ paste(triple.MM$tar.of.mir,triple.MM$reg.of.mir,sep='---') %in% paste(gene2gene2sign$reg,gene2gene2sign$tar,sep='---'),] colnames(FBL) = c('reg.of.mir','tar.of.mir','mir','sign.reg2mir','sign.mir2tar','sign.tar2reg') subtype = rep('++',nrow(FBL)) subtype[FBL$sign.reg2mir==1 & FBL$sign.tar2reg==-1]='+-' subtype[FBL$sign.reg2mir==-1 & FBL$sign.tar2reg==1]='-+' subtype[FBL$sign.reg2mir==-1 & FBL$sign.tar2reg==-1]='--' if (nrow(FBL)>0) { FBL = data.frame(FBL,subtype=paste('FBL',as.character(subtype),sep='.')) FBL.count = as.data.frame(table(FBL$subtype)) #FBL.count = data.frame(motif=FBL$subtype,count=FBL.count[,2]) } else FBL.count = data.frame(character(0),numeric(0)) # identify the four FFL.MM motifs. FFL.MM = merge(triple.MM,gene2gene2sign,by.x=c('reg.of.mir','tar.of.mir'),by.y=c('reg','tar')) colnames(FFL.MM) = c('reg.of.mir','tar.of.mir','mir','sign.reg2mir','sign.mir2tar','sign.reg2tar') subtype = rep('++',nrow(FFL.MM)) subtype[FFL.MM$sign.reg2mir==1 & FFL.MM$sign.reg2tar==-1]='+-' subtype[FFL.MM$sign.reg2mir==-1 & FFL.MM$sign.reg2tar==1]='-+' subtype[FFL.MM$sign.reg2mir==-1 & FFL.MM$sign.reg2tar==-1]='--' if (nrow(FFL.MM)>0) { FFL.MM = data.frame(FFL.MM,subtype=paste('FFL.MM',as.character(subtype),sep='.')) FFL.MM.count = as.data.frame(table(FFL.MM$subtype)) #FFL.MM.count = data.frame(motif=paste('FFL.MM',as.character(FFL.MM.count$subtype),sep='.'),count=FFL.MM.count[,2]) } else FFL.MM.count = data.frame(character(0),numeric(0)) rm('triple.MM') # identify the two FFL.MB motifs. triple.MB = merge(mir2gene2sign,mir2gene2sign,by.x='reg',by.y='reg') colnames(triple.MB) = c('mir','tarA.of.mir','sign.mir2tarA','tarB.of.mir','sign.mir2tarB') triple.MB = triple.MB[as.character(triple.MB$tarA.of.mir) != as.character(triple.MB$tarB.of.mir),] FFL.MB = merge(triple.MB,gene2gene2sign,by.x=c('tarA.of.mir','tarB.of.mir'),by.y=c('reg','tar')) colnames(FFL.MB) = c('tarA.of.mir','tarB.of.mir','mir','sign.mir2tarA','sign.mir2tarB','sign.tarA2tarB') subtype = rep('+',nrow(FFL.MB)) subtype[FFL.MB$sign.tarA2tarB==-1] = '-' if (nrow(FFL.MB)>0) { FFL.MB = data.frame(FFL.MB,subtype=paste('FFL.MB',as.character(subtype),sep='.')) FFL.MB.count = as.data.frame(table(FFL.MB$subtype)) #FFL.MB.count = data.frame(motif=paste('FFL.MB',as.character(FFL.MB.count$subtype),sep='.'),count=FFL.MB.count[,2]) } else FFL.MB.count = data.frame(character(0),numeric(0)) rm('triple.MB') # identify the six FFL.ME motifs. triple.ME = merge(gene2mir2sign,gene2mir2sign,by.x='tar',by.y='tar') colnames(triple.ME) = c('mir','regA.of.mir','sign.regA2mir','regB.of.mir','sign.regB2mir') triple.ME = triple.ME[as.character(triple.ME$regA.of.mir) != as.character(triple.ME$regB.of.mir),] FFL.ME = merge(triple.ME,gene2gene2sign,by.x=c('regA.of.mir','regB.of.mir'),by.y=c('reg','tar')) colnames(FFL.ME) = c('regA.of.mir','regB.of.mir','mir','sign.regA2mir','sign.regB2mir','sign.regA2regB') subtype = rep('+++',nrow(FFL.ME)) subtype[FFL.ME$sign.regA2mir==1 & FFL.ME$sign.regB2mir==1 & FFL.ME$sign.regA2regB==-1] = '++-' subtype[FFL.ME$sign.regA2mir==-1 & FFL.ME$sign.regB2mir==1 & FFL.ME$sign.regA2regB==1] = '-++' subtype[FFL.ME$sign.regA2mir==-1 & FFL.ME$sign.regB2mir==1 & FFL.ME$sign.regA2regB==-1] = '-+-' subtype[FFL.ME$sign.regA2mir==1 & FFL.ME$sign.regB2mir==-1 & FFL.ME$sign.regA2regB==1] = '+-+' subtype[FFL.ME$sign.regA2mir==1 & FFL.ME$sign.regB2mir==-1 & FFL.ME$sign.regA2regB==-1] = '+--' subtype[FFL.ME$sign.regA2mir==-1 & FFL.ME$sign.regB2mir==-1 & FFL.ME$sign.regA2regB==1] = '--+' subtype[FFL.ME$sign.regA2mir==-1 & FFL.ME$sign.regB2mir==-1 & FFL.ME$sign.regA2regB==-1] = '---' if (nrow(FFL.ME)>0) { FFL.ME = data.frame(FFL.ME,subtype=paste('FFL.ME',as.character(subtype),sep='.')) FFL.ME.count = as.data.frame(table(FFL.ME$subtype)) #FFL.ME.count = data.frame(motif=paste('FFL.ME',as.character(FFL.ME.count$subtype),sep='.'),count=FFL.ME.count[,2]) } else FFL.ME.count = data.frame(character(0),numeric(0)) rm('triple.ME') motif.instance = list(FBL=FBL,FFL.MM=FFL.MM,FFL.MB=FFL.MB,FFL.ME=FFL.ME) count = rbind(FBL.count, FFL.MM.count, FFL.MB.count,FFL.ME.count) motif.count = rep(0,18) names(motif.count) = c(paste('FBL',c('++','+-','-+','--'),sep='.'),paste('FFL.MB',c('+','-'),sep='.'),paste('FFL.MM',c('++','+-','-+','--'),sep='.'),paste('FFL.ME',c('+++','++-','-++','-+-','--+','---','+-+','+--'),sep='.')) motif.count[as.character(count$Var1)] <- count[,2] motif.matrix <- data.frame(gene1=character(0),gene2=character(0),mir=character(0),sign1=character(0),sign2=character(0),sign3=character(0),subtype=character(0)) for (i in 1:length(motif.instance)) { if (nrow(motif.instance[[i]])>0) { block <- motif.instance[[i]] colnames(block) <- c('gene1','gene2','mir','sign1','sign2','sign3','subtype') motif.matrix <- rbind(motif.matrix,block) } } motif.matrix <- motif.matrix[,c('subtype','gene1','gene2','mir','sign1','sign2','sign3')] colnames(motif.matrix)[1] <- 'motif' list(count=motif.count,matrix=motif.matrix,instance=motif.instance) } lasso.dr1<-function(x,y,kfold=5,rep=1,plot=F,grid=seq(from=0,to=1,length=100)){ # wrapper for lasso in lars, with modifications to cv.lars require(lars) # set.seed(seed) ok<-complete.cases(x,y) x<-x[ok,] # get rid of na's y<-y[ok] # since regsubsets can't handle na's m<-ncol(x) n<-nrow(x) as.matrix(x)->x cvout<-cv.lars1(x,y,K=kfold,plot.it=F,fraction=grid) sumcv<-cvout$cv frac<-cvout$fraction if(rep >1){ for(i in 1:(rep-1)){ sumcv<-sumcv+cv.lars1(x,y,K=kfold,plot.it=F,fraction=grid)$cv } # ends for } # ends if sumcv<-sumcv/rep if(plot) plot(frac,sumcv) sfrac<-frac[which.min(sumcv)] # assume frac stays the same object<-lars(x,y,type="lasso") fit <- predict.lars(object,x,s=sfrac,type="fit",mode="fraction")$fit coeff<-predict.lars(object,x,s=sfrac,type="coef",mode="fraction")$coefficients st<-sum(coeff !=0) # number nonzero mse<-sum((y-fit)^2)/(n-st-1) colnames(x)<-colnames(x,do.NULL=F,prefix="") # corrects for no colnames x<-x[,colnames(x)[which(coeff !=0)]] #cat("",fill=T) #cat("K (num. of folds)=",kfold,fill=T) #cat("Num. of reps)=",rep,fill=T) #cat("Fraction from CV=",sfrac,fill=T) #cat("number nonzero=",st,fill=T) #cat("mse=",mse,fill=T) #cat("Variables Selected =",colnames(x),fill=T) return(list(l.object=object,frac=frac,fit=fit,variables=colnames(x),coeff=coeff[which(coeff !=0)],st=st,mse=mse,sumcv=sumcv)) } cv.lars1<- function (x, y, K = 10, fraction = seq(from = 0, to = 1, length = 100), trace = FALSE, plot.it = TRUE, se = TRUE) { # Howard Bondell modification of cv.lars ### add this code full.fit <- lars(x, y, trace = trace) betas <- full.fit$beta sbetas <- scale(betas, FALSE, 1/full.fit$normx) kp <- dim(betas) k <- kp[1] p <- kp[2] norm.bound <- fraction*(abs(sbetas[k,]) %*% rep(1, p)) ### all.folds <- cv.folds(length(y), K) residmat <- matrix(0, length(fraction), K) for (i in seq(K)) { omit <- all.folds[[i]] fit <- lars(x[-omit, ], y[-omit], trace = trace) ### fit <- predict(fit, x[omit, , drop = FALSE], mode = "fraction", ### s = fraction)$fit ### Replace this line with fit <- predict.lars1(fit, x[omit, , drop = FALSE], mode = "norm", s = norm.bound)$fit ### if (length(omit) == 1) fit <- matrix(fit, nrow = 1) residmat[, i] <- apply((y[omit] - fit)^2, 2, mean) if (trace) cat("\n CV Fold", i, "\n\n") } cv <- apply(residmat, 1, mean) cv.error <- sqrt(apply(residmat, 1, var)/K) object <- list(fraction = fraction, cv = cv, cv.error = cv.error) if (plot.it) plotCVLars(object, se = se) invisible(object) } predict.lars1 <- function (object, newx, s, type = c("fit", "coefficients"), mode = c("step", "fraction", "norm")) { mode <- match.arg(mode) type <- match.arg(type) if (missing(newx) & type == "fit") { warning("Type=fit with no newx argument; type switched to coefficients") type <- "coefficients" } betas <- object$beta sbetas <- scale(betas, FALSE, 1/object$normx) kp <- dim(betas) k <- kp[1] p <- kp[2] steps <- seq(k) if (missing(s)) { s <- steps mode <- "step" } sbeta <- switch(mode, step = { if (any(s < 0) | any(s > k)) stop("Argument s out of range") steps }, fraction = { if (any(s > 1) | any(s < 0)) stop("Argument s out of range") nbeta <- drop(abs(sbetas) %*% rep(1, p)) nbeta/nbeta[k] ### This part is changed }, norm = { nbeta <- drop(abs(sbetas) %*% rep(1, p)) if (any(s < 0)) stop("Argument s out of range") nbeta }) s[s>nbeta[k]] <- nbeta[k] ### sfrac <- (s - sbeta[1])/(sbeta[k] - sbeta[1]) sbeta <- (sbeta - sbeta[1])/(sbeta[k] - sbeta[1]) usbeta <- unique(sbeta) useq <- match(usbeta, sbeta) sbeta <- sbeta[useq] betas <- betas[useq, ] coord <- approx(sbeta, seq(sbeta), sfrac)$y left <- floor(coord) right <- ceiling(coord) newbetas <- ((sbeta[right] - sfrac) * betas[left, , drop = FALSE] + (sfrac - sbeta[left]) * betas[right, , drop = FALSE])/(sbeta[right] - sbeta[left]) newbetas[left == right, ] <- betas[left[left == right], ] robject <- switch(type, coefficients = list(s = s, fraction = sfrac, mode = mode, coefficients = drop(newbetas)), fit = list(s = s, fraction = sfrac, mode = mode, fit = drop(scale(newx, object$meanx, FALSE) %*% t(newbetas)) + object$mu)) robject }