### source codes of manuscript "Combinatorial Network of Transcriptional Regulation and microRNA Regulation in Human Cancer"
### Author: Hui Yu (yuhui@scbit.org)
### We would like to thank Prof. Dennis D. Boos for publicizing the lars-based LASSO wrapper function lasso.dr1(). (http://www4.stat.ncsu.edu/~boos/var.select/lasso.wrapper.html)
###miRNAII.regression(miRNA.exp,gene.exp,tf2gene,tf2mir,mir2gene,p.value=0.05,step.do=F,prefilter=F,penalty=c('AIC','BIC')[1]) ): using stepwise linear regression or LASSO regression to sift forward-predicted three types of regulations according to parallel expression datasets. step.do=T/F: STEP/LASSO; prefilter=T/F: implement univarate regression prefiltration or not.
###vertex.edge.feature(reg2tar): calculates the vertex features (in degree, out degree, betweenness) and edge feature (betweenness).
###coregulating.pairs(reg2tar,p.cutoff=0.01): identifies the coordinating regulator pairs with significantly high portion of overlapped targets. p.cutoff: threshold of fisher's exact test p-value.
###motif.p(node2node2sign2type,n): identifies the significantly recurrent network motifs. n: times of network shufflings.
miRNAII.regression <- function(miRNA.exp,gene.exp,tf2gene,tf2mir,mir2gene,p.value=0.05,step.do=T,prefilter=T,penalty=c('AIC','BIC')[1]) {
# F:\K\webserver\miR-II Rs\tf.mir.regression.RData as working image.
# miRNA.exp: 195 * 60 data matrix, rownames ( "hsa-let-7a" "hsa-let-7b" "hsa-let-7c" "hsa-let-7d" "hsa-let-7e" ...); colname order of miRNA.exp and gene.exp SHOULD BE identical.
# gene.exp: 8388 * 59 data matrix, rownames ("15E1.2" "2'-PDE" "76P" "A2BP1" "AACS" ...); colname order of miRNA.exp and gene.exp SHOULD BE identical.
# tf2gene, tf2mir, mir2gene: two-column data matrices.
# step.do: allowing user to decide to use 'stepwise' linear regression or not.
# output:
#########tf2mir.out: (inferred in Module I) four columns: tf, mir, coef, pval
#########reg2gene.out: inferred in Module II, including tf2gene.out and mir2gene.out conceptually.
#
tf2mir <- unique(tf2mir)
mir2gene <- unique(mir2gene)
tf2gene <- unique(tf2gene)
# Module I: inferring regulations towards mirs.
mirs.as.target <- intersect(rownames(miRNA.exp),unique(tf2mir[,2]))
#tfs.as.regulator <- intersect(rownames(gene.exp),unique(tf2mir[,1]))
tf2mir.out <- NULL # to be expanded through rbinding data blocks.
cat('now MODULE I: inferring tf->mirs...\n')
cat.i = 0
questionable = character(0)
for (i in 1:length(mirs.as.target)) {
if ( (i*100/length(mirs.as.target))%/%10>cat.i ) {
cat.i = cat.i+1
cat(cat.i*10,'%', '\n')
}
mir.i <- mirs.as.target[i]
cat(mir.i,': \n')
tfs.to.i <- intersect(tf2mir[tf2mir[,2]==mir.i,1],rownames(gene.exp))
if (length(tfs.to.i)>=1) {
if (length(tfs.to.i)==1) {
d <- data.frame(mir<-miRNA.exp[mir.i,],gene.exp[tfs.to.i,])
# I'm not sure whether I should use t(gene.exp[tfs.to.i,])
colnames(d)<-c('mir',tfs.to.i)
} else {
d <- data.frame(mir<-miRNA.exp[mir.i,],t(gene.exp[tfs.to.i,])) # t(miRNA.exp[mir.i,]) is questionable. If so, the two branches can be combined.
colnames(d)<-c('mir',tfs.to.i)
}
if (prefilter) {
ix <- c(1)
c<-NULL
for (j in 2:ncol(d)) {
#cat('\t',j,' of ',ncol(d),' ')
d.j<-d[,c(1,j)]
#cat('extracting univariable step\n')
m.j<-glm(mir~.,data=d.j)
#cat('passed univariable step\n')
if (summary(m.j)$coef[2,4]
1) {
#m<-glm(mir~.,data=d[,ix])
#cat('multiple variable regression finished.\n')
if (step.do) {
m<-glm(mir~.,data=d[,ix])
if (penalty=='AIC') {
k<-2
} else {
k<-log(length(ncol(miRNA.exp))) # orginally from "k<-log(length(logfc))" in miR-I code.
}
sink('/dev/null')
m<-step(m,k=k,silent=T) # silent unuseful?
sink()
c<-summary(m)$coef
tfs.to.i <- setdiff(rownames(c),c('(Intercept)'))
coef.to.i <- c[tfs.to.i,1]
pval.to.i <- c[tfs.to.i,4]
tf2mir.out <- rbind(tf2mir.out,data.frame(regulator=tfs.to.i,coef=coef.to.i,pval=pval.to.i,target=mir.i))
} else {
#cat('las returned.\n')
#sink('/dev/null/')
#cat(colnames(d[-1]),'\n')
if (length(ix)==2) {
noise = rnorm(nrow(d),nrow(d),1)
d <- data.frame(d,noise)
ix = seq(3)
questionable = c(questionable,mir.i)
#object = lasso.dr1(d[,ix[-1]],d[,1])
}
object = lasso.dr1(d[,ix[-1]],d[,1])
#sink()
# cat('las returned.\n')
#save.image('debugging.RData')
if (length(object$variables)>0) {
tfs.to.i <- object$variables
#cat(tfs.to.i,'\n')
coef.to.i <- object$coeff
error.to.i <- object$mse
target <- mir.i
tf2mir.out <- rbind(tf2mir.out,data.frame(regulator=tfs.to.i,coef=coef.to.i,error=error.to.i,target=mir.i))
}
}
#cat('step finished.\n')
}
}
}
# inferring regulations towards genes.
genes.as.target <- intersect(rownames(gene.exp),union(unique(tf2gene[,2]),unique(mir2gene[,2])))
cat('now MODULE II: inferring regulators->genes...'); cat('\n')
reg2gene.out<-NULL
cat.i = 0
for (i in 1:length(genes.as.target)) {
gene.i <- genes.as.target[i]
tfs.to.i <- intersect(tf2gene[tf2gene[,2]==gene.i,1],rownames(gene.exp))
mirs.to.i <- intersect(mir2gene[mir2gene[,2]==gene.i,1],rownames(miRNA.exp))
#cat(i*100/length(genes.as.target),'% ',gene.i,' ',length(tfs.to.i)+length(mirs.to.i),'candidates: \n')
if (length(tfs.to.i)+length(mirs.to.i)>=1) {
if (length(tfs.to.i)+length(mirs.to.i)==1) {
if (length(tfs.to.i)==1) {
d <- d <- data.frame(gene<-gene.exp[gene.i,],gene.exp[tfs.to.i,])
} else {
d <- d <- data.frame(gene<-gene.exp[gene.i,],miRNA.exp[mirs.to.i,])
}
} else {
d <- data.frame(gene<-gene.exp[gene.i,],t(rbind(gene.exp[tfs.to.i,],miRNA.exp[mirs.to.i,])))
}
colnames(d)<-c('gene',tfs.to.i,mirs.to.i)
ix <- c(1)
if ( (i*100/length(genes.as.target))%/%5>cat.i ) {
cat.i = cat.i+1
cat(cat.i*5,'%', '\n')
}
#cat(i*100/length(genes.as.target),'% ')
#cat(i*100/length(genes.as.target),'% \n') #,gene.i,' ',length(tfs.to.i)+length(mirs.to.i),'candidates:\n ')
#cat(gene.i,': ')
if (prefilter) {
for (j in 2:ncol(d)) {
d.j<-d[,c(1,j)]
m.j<-glm(gene~.,data=d.j)
if (summary(m.j)$coef[2,4]1) {
m<-glm(gene~.,data=d[,ix])
if (step.do) {
m<-glm(gene~.,data=d[,ix])^M
if (penalty=='AIC') {
k<-2
} else {
k<-log(length(ncol(miRNA.exp))) # orginally from "k<-log(length(logfc))" in miR-I code.
}
sink('/dev/null')
m<-step(m,k=k,silent=T)
sink()
c<-summary(m)$coef^M
reg.to.i <- setdiff(rownames(c),c('(Intercept)'))^M
coef.to.i <- c[reg.to.i,1]^M
pval.to.i <- c[reg.to.i,4]^M
reg2gene.out <- rbind(reg2gene.out,data.frame(regulator=as.character(reg.to.i),coef=coef.to.i,pval=pval.to.i,target=as.character(gene.i)))^M
} else {
if (length(ix)==2) {
noise = rnorm(nrow(d),nrow(d),1)
d <- data.frame(d,noise)
ix = seq(3)
questionable = c(questionable,gene.i)
#object = lasso.dr1(d[,ix[-1]],d[,1])
}
object = lasso.dr1(d[,ix[-1]],d[,1])
if (length(object$variables)>0) {
reg.to.i <- object$variables
coef.to.i <- object$coeff
error.to.i <- object$mse
target <- mir.i
if (sum(grepl('hsa-',reg.to.i,fixed=T))>0) {
mir.pos <- grepl('hsa-',reg.to.i,fixed=T)
if (sum(coef.to.i[mir.pos]>0) ==0 )
reg2gene.out <- rbind(reg2gene.out,data.frame(regulator=as.character(reg.to.i),coef=coef.to.i,error=error.to.i,target=as.character(gene.i)))
} else
reg2gene.out <- rbind(reg2gene.out,data.frame(regulator=as.character(reg.to.i),coef=coef.to.i,error=error.to.i,target=as.character(gene.i)))
}
}
}
}
}
rownames(reg2gene.out)<-NULL
#cat('out of two modules\n')
#reg2tar.out <- rbind(tf2mir.out,reg2gene.out)
#colnames(reg2tar.out)<- c('regulator','coefficient','p.value','target')
write.table(rbind(tf2mir.out,reg2gene.out),'reg2tar.filtered.txt',sep='\t',quote=F,row.names=F)
cat('regression finished!\n')
reg2tar.res <- list(tf2mir=tf2mir.out,reg2gene=reg2gene.out)
matrix.reg2tar.res <- function(reg2tar.res) {
type.tf2mir = rep('tf2mir',nrow(reg2tar.res$tf2mir))
types = rep('tf2gene',nrow(reg2tar.res$reg2gene))
attach(reg2tar.res$reg2gene)
types[grepl('hsa-',regulator,fixed=T)] <- 'mir2gene' #need an exact parameter
detach(reg2tar.res$reg2gene)
type = c(type.tf2mir,types)
reg2tar.out <- rbind(reg2tar.res$tf2mir,reg2tar.res$reg2gene)
reg2tar.out <- data.frame(reg2tar.out,type=type)
reg2tar.out = reg2tar.out[,c('type','regulator','target','coef','error')]
}
write.table(questionable,'questionable.txt',quote=F)
reg2tar.out <- matrix.reg2tar.res(reg2tar.res)
}
######################################################################################################################
################################vertex.edge.feature (with powerlaw)###################################################
######################################################################################################################
vertex.edge.feature <- function(reg2tar) {
# input 'reg2tar': two columns of regulators and targets (three types tf2mir,tf2gene, and mir2gene).
# outputs a list of two components:
# v.properties: vertices, in.degree, out.degree, betweenness
# e.properties: e.v1,e.v2,betweenness.
if (require(igraph)) {
#reg2tar <- rbind(reg2tar$tf2mir,reg2tar$reg2gene)
#reg2tar <- reg2tar[,c(1,4)]
#g<-graph.empty()
#g<-add.edges(g,reg2tar)
g <- make.graph(t(reg2tar))
vertices = V(g)$name
v.in.degree = degree(g,mode='in')
v.out.degree = degree(g,mode='out')
#power.law(v.out.degree,'power law.tff')
v.betweenness = betweenness(g)
e.betweenness = edge.betweenness(g)
v.properties = data.frame(vertices=vertices,in.degree=v.in.degree,out.degree=v.out.degree,betweenness=v.betweenness)
edges = get.edgelist(g)
type = rep('tf2gene',nrow(edges))
type[grepl('hsa-',edges[,1],fixed=T)] <- 'mir2gene'
type[grepl('hsa-',edges[,2],fixed=T)] <- 'tf2mir' #need an exact parameter
e.properties = data.frame(type=type,e.v1=edges[,1],e.v2=edges[,2],betweenness=e.betweenness)
#write.table(v.properties,'v.properties.txt',row.names=F,sep='\t',quote=F)
#write.table(e.properties,'e.properties.txt',row.names=F,sep='\t',quote=F)
propoerty = list(vertex=v.properties,edge=e.properties)
}
}
power.law <- function(k,fig='power law.tiff') {
logk=log2(k);logk[-logk==Inf]=0
breaks = min(logk)+0:20*((max(logk)-min(logk))/20)
hst = hist(logk,breaks=breaks)
logk=(breaks[1:20]+breaks[2:21])/2
logpk=log2(hst$counts); logpk[-logpk==Inf]=0
m=glm(y~x,data=data.frame(y=logpk,x=logk))
equation = paste('y = ',substr(coefficients(m)['(Intercept)'],1,5),' + ',substr(coefficients(m)['x'],1,4),'x',sep='')
corr = cor(x=logk,y=logpk,method='pearson')
main.txt=paste(equation,' r=',substr(corr,1,5),' 20 points')
#y=predict(m,newdata=data.frame(logk))
#cat(logpk)
tiff(fig,width=1028,height=768)
plot(logk,logpk,xlim=c(min(logk)-1,max(logk)+1),ylim=c(min(logpk)-1,max(logpk)+1),pch=16,xlab='logrithm of degree at base 2',ylab='logrithm of number of nodes',main=main.txt)
#lines(logk,fitted(y))
dev.off()
}
#make.graph <- function (edges) {
# nodes<-unique(as.character(edges))
# nodes.i<-1:length(nodes)-1; names(nodes.i)<-nodes
# edges.i<-nodes.i[edges]
# g<-graph(edges.i,directed=T)
# V(g)$name<-nodes
# g
#};
######################################################################################################################
################################coregulating.pairs####################################################################
######################################################################################################################
coregulating.pairs <- function(reg2tar,p.cutoff=0.01) {
#input: reg2tar: two columns of regulators and targets.
#output: a matrix of information on coregulating pairs.
# check if self-regulations exist. if yes, run out with a warning.
g <- make.graph(t(reg2tar))
colnames(reg2tar)=c('reg','tar')
node.deg = data.frame(node=V(g)$name,deg=degree(g, mode='out'))
reg1.reg2.tar = merge(reg2tar,reg2tar,by.x='tar',by.y='tar')
reg1.reg2.tar = reg1.reg2.tar[,c(2,3,1)]
colnames(reg1.reg2.tar) = c('reg1','reg2','coTar')
reg1.reg2.tar=reg1.reg2.tar[as.character(reg1.reg2.tar[,1])!=as.character(reg1.reg2.tar[,2]),] # after this step, each binary tuple is still repeated with reversed order.
reg1.reg2.coTarCounts = as.data.frame(table(reg1.reg2.tar[1:2]))
reg1.reg2.coTarCounts = reg1.reg2.coTarCounts[reg1.reg2.coTarCounts[,3]!=0,]
colnames(reg1.reg2.coTarCounts)= c('reg1','reg2','coTarCounts')
count.table.deg1 = merge(reg1.reg2.coTarCounts,node.deg,by.x='reg1',by.y='node')
count.table.deg12 = merge(count.table.deg1,node.deg,by.x='reg2',by.y='node')
count.table.deg12 = count.table.deg12[,c(2,1,3,4,5)]
colnames(count.table.deg12) = c('reg1','reg2','coCount','count1','count2')
reg2tar.logic = data.frame(reg2tar,edge=1)
count.table.deg12 = merge(count.table.deg12,reg2tar.logic,by.y=c('reg','tar'),by.x=c('reg1','reg2'),all.x=T)
colnames(count.table.deg12)[6] = 'reg1.to.reg2'
count.table.deg12 = merge(count.table.deg12,reg2tar.logic,by.y=c('reg','tar'),by.x=c('reg2','reg1'),all.x=T)
colnames(count.table.deg12)[7] = 'reg2.to.reg1'
#colnames(count.table.deg12)[3:4] = c('reg1reg2','reg2reg1')
count.table.deg12 = count.table.deg12[,c(2,1,3:7)]
count.table.deg12[is.na(count.table.deg12)] = 0
a = count.table.deg12$coCount
b = count.table.deg12$count1-a-count.table.deg12$reg1.to.reg2
c = count.table.deg12$count2-a-count.table.deg12$reg2.to.reg1
d = vcount(g)-2-(a+b+c)
rowwise.fisher = function(x) {
fisher.test(matrix(x,nc=2,byrow=T),alternative='g')$p.value
} # a coregulating pair where A->B or A<-B needs to be specially considered.
fisher.pval = apply(data.frame(a,b,c,d),1,rowwise.fisher)
m = data.frame(count.table.deg12,fisher.pval=fisher.pval)
m = m[m$fisher.pval<=p.cutoff,]
type = rep('tf.tf',times=nrow(m))
type[grepl('hsa-',m$reg1,fixed=T) | grepl('hsa-',m$reg2,fixed=T)]='tf.mir'
type[grepl('hsa-',m$reg1,fixed=T) & grepl('hsa-',m$reg2,fixed=T)]='mir.mir'
out = data.frame(type=type,m)
ix = order(out$fisher.pval,out$type,out$coCount,out$count1+out$count2)
out = out[ix,]
ix.half = seq(from=2,by=2,to=nrow(out))
out = out[ix.half,]
ix = order(out$type,out$fisher.pval)
out = out[ix,]
}
make.graph <- function (edges) {
if (require(igraph)) {
nodes<-unique(as.character(edges))
nodes.i<-1:length(nodes)-1; names(nodes.i)<-nodes
edges.i<-nodes.i[edges]
g<-graph(edges.i,directed=T)
V(g)$name<-nodes
g
}
};
motif.p <- function(node2node2sign2type,n) {
# input: node2node2sign2type,n
# output: a matrix with three columns: motif names, counts, and p.
p <- rep(1,18)
count.s <- matrix(0,nr=n,nc=18)
names(p) <- colnames(count.s) <- c(paste('FBL',c('++','+-','-+','--'),sep='.'),paste('FFL.MB',c('+','-'),sep='.'),paste('FFL.MM',c('++','+-','-+','--'),sep='.'),paste('FFL.ME',c('+++','++-','-++','-+-','--+','---','+-+','+--'),sep='.'))
#cat(names(p),'\n')
for (i in 1:n) {
cat('network shuffling ',i*100/n,'%\n')
g.s <- combinat.network.random(node2node2sign2type)
e.s <- get.edgelist(g.s)
sign <- E(g.s)$sign
node2node2sign.s <- data.frame(e.s,sign)
res <- counting.motifs(node2node2sign.s)
#cat(res$count,'\n')
count.s[i,] <- res$count
}
res.motif <- counting.motifs(node2node2sign2type[,1:3])
count <- res.motif$count
for (i in 1:18) {
p[i] <- sum(count.s[,i]>=count[i])/n
}
data.frame(motif=names(p),count=count,p=p)
}
combinat.network.random <- function(node2node2sign2type) {
node2node2sign = node2node2sign2type[,1:3]
type = as.character(node2node2sign2type[,4])
type.unique = unique(type)
e.s = matrix(nr=0,nc=2); sign = character(0)
for (i in 1:length(type.unique)) {
node2node2sign.i = node2node2sign[type==type.unique[i],]
#cat('before a type.\n')
g.s.i = type.network.random(node2node2sign.i)
#cat('after a type.\n')
e.s.i = get.edgelist(g.s.i)
sign.i = E(g.s.i)$sign
e.s = rbind(e.s,e.s.i)
sign = c(sign,sign.i)
}
#cat('e.s have seemingly been generated.\n')
g.s = graph.data.frame(e.s,directed=T)
E(g.s)$sign = sign
E(g.s)$type = type
g.s
}
type.network.random <- function(node2node2sign) {
if (require(igraph)) {
g <- graph.data.frame(node2node2sign[,1:2], directed=T)
E(g)$sign = node2node2sign[,3]
d <- sapply(c('out','in'), function(mode) {degree(graph=g, mode=mode)})
#set.seed(1) # Is this sentence necessary? Why is it necessary?
g.s <- simplify(degree.sequence.game(out.deg=d[,'out'], in.deg=d[,'in']))
V(g.s)$name = V(g)$name
d.s <- sapply(c('out','in'), function(mode) {degree(graph=g.s, mode=mode)})
d.delta <- d - d.s
e.s = get.edgelist(g.s)
#cat('before the iteration\n')
rep = 0
while ( sum(d.delta[,1]!=0)>1 | sum(d.delta[,2]!=0)>1 ) {
rep = rep+1
#cat('enter repetitive network shuffling times:',rep,'\n')
# ix <- which(d.delta[,1] != 0 | d.delta[,2] != 0)
g.s.2 <- simplify(degree.sequence.game(out.deg=d.delta[,'out'], in.deg=d.delta[,'in']))
V(g.s.2)$name = V(g)$name
d.s.2 <- sapply(c('out','in'), function(mode) {degree(graph=g.s.2, mode=mode)})
d.s = d.s + d.s.2
d.delta = d - d.s
e.s.2 = get.edgelist(g.s.2)
e.s = rbind(e.s,e.s.2)
#cat('in.delta ',sum(d.delta[,1]!=0),' out.delta ',sum(d.delta[,2]!=0),'\n')
#g.s = graph(t(e.s,directed=T))
#d.s <- sapply(c('out','in'), function(mode) {degree(graph=g.s, mode=mode)})
}
#cat('after the iteration\n')
g.s = graph.data.frame(e.s,directed=T)
V(g.s)$name = V(g)$name
E(g.s)$sign = E(g)$sign
g.s
} else {
cat('Sorry, you must install igraph first.\n')
}
}
# suppose 'sign' is indicated by {1,-1}
# input: node2node2sign: three columns: node1,node2,sign.
# output: list(cout, instance)
####### count: a vector of 18 elements, with names specifying all possible 18 three-vertexed motifs.
####### instance: all instances of each subtype motif.
counting.motifs <- function(node2node2sign) {
# output: motif count and motif instances, two componenets of a list.
colnames(node2node2sign) = c('reg','tar','sign')
gene2gene2sign = node2node2sign[!grepl('hsa-',node2node2sign$reg,fixed=T) & !grepl('hsa-',node2node2sign$tar,fixed=T),]
mir2gene2sign = node2node2sign[grepl('hsa-',node2node2sign$reg,fixed=T) & !grepl('hsa-',node2node2sign$tar,fixed=T),]
gene2mir2sign = node2node2sign[!grepl('hsa-',node2node2sign$reg,fixed=T) & grepl('hsa-',node2node2sign$tar,fixed=T),]
# identify the four FBL motifs.
triple.MM = merge(gene2mir2sign,mir2gene2sign,by.x='tar',by.y='reg')
colnames(triple.MM) = c('mir','reg.of.mir','sign.reg2mir','tar.of.mir','sign.mir2tar')
triple.MM = triple.MM[as.character(triple.MM$reg.of.mir) != as.character(triple.MM$tar.of.mir),]
FBL = merge(triple.MM,gene2gene2sign,by.x=c('reg.of.mir','tar.of.mir'),by.y=c('tar','reg'))
#FBL = triple.MM[ paste(triple.MM$tar.of.mir,triple.MM$reg.of.mir,sep='---') %in% paste(gene2gene2sign$reg,gene2gene2sign$tar,sep='---'),]
colnames(FBL) = c('reg.of.mir','tar.of.mir','mir','sign.reg2mir','sign.mir2tar','sign.tar2reg')
subtype = rep('++',nrow(FBL))
subtype[FBL$sign.reg2mir==1 & FBL$sign.tar2reg==-1]='+-'
subtype[FBL$sign.reg2mir==-1 & FBL$sign.tar2reg==1]='-+'
subtype[FBL$sign.reg2mir==-1 & FBL$sign.tar2reg==-1]='--'
if (nrow(FBL)>0) {
FBL = data.frame(FBL,subtype=paste('FBL',as.character(subtype),sep='.'))
FBL.count = as.data.frame(table(FBL$subtype))
#FBL.count = data.frame(motif=FBL$subtype,count=FBL.count[,2])
} else FBL.count = data.frame(character(0),numeric(0))
# identify the four FFL.MM motifs.
FFL.MM = merge(triple.MM,gene2gene2sign,by.x=c('reg.of.mir','tar.of.mir'),by.y=c('reg','tar'))
colnames(FFL.MM) = c('reg.of.mir','tar.of.mir','mir','sign.reg2mir','sign.mir2tar','sign.reg2tar')
subtype = rep('++',nrow(FFL.MM))
subtype[FFL.MM$sign.reg2mir==1 & FFL.MM$sign.reg2tar==-1]='+-'
subtype[FFL.MM$sign.reg2mir==-1 & FFL.MM$sign.reg2tar==1]='-+'
subtype[FFL.MM$sign.reg2mir==-1 & FFL.MM$sign.reg2tar==-1]='--'
if (nrow(FFL.MM)>0) {
FFL.MM = data.frame(FFL.MM,subtype=paste('FFL.MM',as.character(subtype),sep='.'))
FFL.MM.count = as.data.frame(table(FFL.MM$subtype))
#FFL.MM.count = data.frame(motif=paste('FFL.MM',as.character(FFL.MM.count$subtype),sep='.'),count=FFL.MM.count[,2])
} else FFL.MM.count = data.frame(character(0),numeric(0))
rm('triple.MM')
# identify the two FFL.MB motifs.
triple.MB = merge(mir2gene2sign,mir2gene2sign,by.x='reg',by.y='reg')
colnames(triple.MB) = c('mir','tarA.of.mir','sign.mir2tarA','tarB.of.mir','sign.mir2tarB')
triple.MB = triple.MB[as.character(triple.MB$tarA.of.mir) != as.character(triple.MB$tarB.of.mir),]
FFL.MB = merge(triple.MB,gene2gene2sign,by.x=c('tarA.of.mir','tarB.of.mir'),by.y=c('reg','tar'))
colnames(FFL.MB) = c('tarA.of.mir','tarB.of.mir','mir','sign.mir2tarA','sign.mir2tarB','sign.tarA2tarB')
subtype = rep('+',nrow(FFL.MB))
subtype[FFL.MB$sign.tarA2tarB==-1] = '-'
if (nrow(FFL.MB)>0) {
FFL.MB = data.frame(FFL.MB,subtype=paste('FFL.MB',as.character(subtype),sep='.'))
FFL.MB.count = as.data.frame(table(FFL.MB$subtype))
#FFL.MB.count = data.frame(motif=paste('FFL.MB',as.character(FFL.MB.count$subtype),sep='.'),count=FFL.MB.count[,2])
} else FFL.MB.count = data.frame(character(0),numeric(0))
rm('triple.MB')
# identify the six FFL.ME motifs.
triple.ME = merge(gene2mir2sign,gene2mir2sign,by.x='tar',by.y='tar')
colnames(triple.ME) = c('mir','regA.of.mir','sign.regA2mir','regB.of.mir','sign.regB2mir')
triple.ME = triple.ME[as.character(triple.ME$regA.of.mir) != as.character(triple.ME$regB.of.mir),]
FFL.ME = merge(triple.ME,gene2gene2sign,by.x=c('regA.of.mir','regB.of.mir'),by.y=c('reg','tar'))
colnames(FFL.ME) = c('regA.of.mir','regB.of.mir','mir','sign.regA2mir','sign.regB2mir','sign.regA2regB')
subtype = rep('+++',nrow(FFL.ME))
subtype[FFL.ME$sign.regA2mir==1 & FFL.ME$sign.regB2mir==1 & FFL.ME$sign.regA2regB==-1] = '++-'
subtype[FFL.ME$sign.regA2mir==-1 & FFL.ME$sign.regB2mir==1 & FFL.ME$sign.regA2regB==1] = '-++'
subtype[FFL.ME$sign.regA2mir==-1 & FFL.ME$sign.regB2mir==1 & FFL.ME$sign.regA2regB==-1] = '-+-'
subtype[FFL.ME$sign.regA2mir==1 & FFL.ME$sign.regB2mir==-1 & FFL.ME$sign.regA2regB==1] = '+-+'
subtype[FFL.ME$sign.regA2mir==1 & FFL.ME$sign.regB2mir==-1 & FFL.ME$sign.regA2regB==-1] = '+--'
subtype[FFL.ME$sign.regA2mir==-1 & FFL.ME$sign.regB2mir==-1 & FFL.ME$sign.regA2regB==1] = '--+'
subtype[FFL.ME$sign.regA2mir==-1 & FFL.ME$sign.regB2mir==-1 & FFL.ME$sign.regA2regB==-1] = '---'
if (nrow(FFL.ME)>0) {
FFL.ME = data.frame(FFL.ME,subtype=paste('FFL.ME',as.character(subtype),sep='.'))
FFL.ME.count = as.data.frame(table(FFL.ME$subtype))
#FFL.ME.count = data.frame(motif=paste('FFL.ME',as.character(FFL.ME.count$subtype),sep='.'),count=FFL.ME.count[,2])
} else FFL.ME.count = data.frame(character(0),numeric(0))
rm('triple.ME')
motif.instance = list(FBL=FBL,FFL.MM=FFL.MM,FFL.MB=FFL.MB,FFL.ME=FFL.ME)
count = rbind(FBL.count, FFL.MM.count, FFL.MB.count,FFL.ME.count)
motif.count = rep(0,18)
names(motif.count) = c(paste('FBL',c('++','+-','-+','--'),sep='.'),paste('FFL.MB',c('+','-'),sep='.'),paste('FFL.MM',c('++','+-','-+','--'),sep='.'),paste('FFL.ME',c('+++','++-','-++','-+-','--+','---','+-+','+--'),sep='.'))
motif.count[as.character(count$Var1)] <- count[,2]
motif.matrix <- data.frame(gene1=character(0),gene2=character(0),mir=character(0),sign1=character(0),sign2=character(0),sign3=character(0),subtype=character(0))
for (i in 1:length(motif.instance)) {
if (nrow(motif.instance[[i]])>0) {
block <- motif.instance[[i]]
colnames(block) <- c('gene1','gene2','mir','sign1','sign2','sign3','subtype')
motif.matrix <- rbind(motif.matrix,block)
}
}
motif.matrix <- motif.matrix[,c('subtype','gene1','gene2','mir','sign1','sign2','sign3')]
colnames(motif.matrix)[1] <- 'motif'
list(count=motif.count,matrix=motif.matrix,instance=motif.instance)
}
lasso.dr1<-function(x,y,kfold=5,rep=1,plot=F,grid=seq(from=0,to=1,length=100)){
# wrapper for lasso in lars, with modifications to cv.lars
require(lars)
# set.seed(seed)
ok<-complete.cases(x,y)
x<-x[ok,] # get rid of na's
y<-y[ok] # since regsubsets can't handle na's
m<-ncol(x)
n<-nrow(x)
as.matrix(x)->x
cvout<-cv.lars1(x,y,K=kfold,plot.it=F,fraction=grid)
sumcv<-cvout$cv
frac<-cvout$fraction
if(rep >1){
for(i in 1:(rep-1)){
sumcv<-sumcv+cv.lars1(x,y,K=kfold,plot.it=F,fraction=grid)$cv
} # ends for
} # ends if
sumcv<-sumcv/rep
if(plot) plot(frac,sumcv)
sfrac<-frac[which.min(sumcv)] # assume frac stays the same
object<-lars(x,y,type="lasso")
fit <- predict.lars(object,x,s=sfrac,type="fit",mode="fraction")$fit
coeff<-predict.lars(object,x,s=sfrac,type="coef",mode="fraction")$coefficients
st<-sum(coeff !=0) # number nonzero
mse<-sum((y-fit)^2)/(n-st-1)
colnames(x)<-colnames(x,do.NULL=F,prefix="") # corrects for no colnames
x<-x[,colnames(x)[which(coeff !=0)]]
#cat("",fill=T)
#cat("K (num. of folds)=",kfold,fill=T)
#cat("Num. of reps)=",rep,fill=T)
#cat("Fraction from CV=",sfrac,fill=T)
#cat("number nonzero=",st,fill=T)
#cat("mse=",mse,fill=T)
#cat("Variables Selected =",colnames(x),fill=T)
return(list(l.object=object,frac=frac,fit=fit,variables=colnames(x),coeff=coeff[which(coeff !=0)],st=st,mse=mse,sumcv=sumcv))
}
cv.lars1<- function (x, y, K = 10, fraction = seq(from = 0, to = 1, length = 100),
trace = FALSE, plot.it = TRUE, se = TRUE)
{
# Howard Bondell modification of cv.lars
### add this code
full.fit <- lars(x, y, trace = trace)
betas <- full.fit$beta
sbetas <- scale(betas, FALSE, 1/full.fit$normx)
kp <- dim(betas)
k <- kp[1]
p <- kp[2]
norm.bound <- fraction*(abs(sbetas[k,]) %*% rep(1, p))
###
all.folds <- cv.folds(length(y), K)
residmat <- matrix(0, length(fraction), K)
for (i in seq(K)) {
omit <- all.folds[[i]]
fit <- lars(x[-omit, ], y[-omit], trace = trace)
### fit <- predict(fit, x[omit, , drop = FALSE], mode = "fraction",
### s = fraction)$fit
### Replace this line with
fit <- predict.lars1(fit, x[omit, , drop = FALSE], mode = "norm",
s = norm.bound)$fit
###
if (length(omit) == 1)
fit <- matrix(fit, nrow = 1)
residmat[, i] <- apply((y[omit] - fit)^2, 2, mean)
if (trace)
cat("\n CV Fold", i, "\n\n")
}
cv <- apply(residmat, 1, mean)
cv.error <- sqrt(apply(residmat, 1, var)/K)
object <- list(fraction = fraction, cv = cv, cv.error = cv.error)
if (plot.it)
plotCVLars(object, se = se)
invisible(object)
}
predict.lars1 <- function (object, newx, s, type = c("fit", "coefficients"), mode = c("step",
"fraction", "norm"))
{
mode <- match.arg(mode)
type <- match.arg(type)
if (missing(newx) & type == "fit") {
warning("Type=fit with no newx argument; type switched to coefficients")
type <- "coefficients"
}
betas <- object$beta
sbetas <- scale(betas, FALSE, 1/object$normx)
kp <- dim(betas)
k <- kp[1]
p <- kp[2]
steps <- seq(k)
if (missing(s)) {
s <- steps
mode <- "step"
}
sbeta <- switch(mode, step = {
if (any(s < 0) | any(s > k))
stop("Argument s out of range")
steps
}, fraction = {
if (any(s > 1) | any(s < 0))
stop("Argument s out of range")
nbeta <- drop(abs(sbetas) %*% rep(1, p))
nbeta/nbeta[k]
### This part is changed
}, norm = {
nbeta <- drop(abs(sbetas) %*% rep(1, p))
if (any(s < 0))
stop("Argument s out of range")
nbeta
})
s[s>nbeta[k]] <- nbeta[k]
###
sfrac <- (s - sbeta[1])/(sbeta[k] - sbeta[1])
sbeta <- (sbeta - sbeta[1])/(sbeta[k] - sbeta[1])
usbeta <- unique(sbeta)
useq <- match(usbeta, sbeta)
sbeta <- sbeta[useq]
betas <- betas[useq, ]
coord <- approx(sbeta, seq(sbeta), sfrac)$y
left <- floor(coord)
right <- ceiling(coord)
newbetas <- ((sbeta[right] - sfrac) * betas[left, , drop = FALSE] +
(sfrac - sbeta[left]) * betas[right, , drop = FALSE])/(sbeta[right] -
sbeta[left])
newbetas[left == right, ] <- betas[left[left == right], ]
robject <- switch(type, coefficients = list(s = s, fraction = sfrac,
mode = mode, coefficients = drop(newbetas)), fit = list(s = s,
fraction = sfrac, mode = mode, fit = drop(scale(newx,
object$meanx, FALSE) %*% t(newbetas)) + object$mu))
robject
}