服务热线
178 0020 3020
setwd("D:/TCGA/BRCA/")
mRNA= read.csv("mRNA/mRNA_TPM.csv", header = T,row.names = 1)
#> dim(expertSet)
#[1] 3659 514
sample = colnames(mRNA)
data_trait = data.frame(
"id"=sample,
"normal"=0,
"tumor"=0,
"stage_early"=0,
"stage_later"=0
)
for(i in 1:length(sample)){
if(as.numeric(substr(sample[i],14,15))<11){
data_trait[i,"tumor"]=1
}else{
data_trait[i,"normal"]=1
}
}
#获得data属性
clinical = read.csv("clinical.csv", header = T)
for(i in 1:length(sample)){
if(as.numeric(substr(sample[i],14,15))<11){
clin = clinical[gsub("-","\\.",clinical$submitter_id)==substr(sample[i],1,12),]
if(nrow(clin)!=0){
if(clin[1,'tumor_stage']=='stage i' | clin[1,'tumor_stage']=='stage ia'){
data_trait[i,'stage_early']=1
}else if(clin[1,'tumor_stage']=='stage ii' | clin[1,'tumor_stage']=='stage iia' | clin[1,'tumor_stage']=='stage iib' | clin[1,'tumor_stage']=='stage iic'){
data_trait[i,'stage_early']=1
}
else if(clin[1,'tumor_stage']=='stage iii' | clin[1,'tumor_stage']=='stage iiia' | clin[1,'tumor_stage']=='stage iiib' | clin[1,'tumor_stage']=='stage iiic'){
data_trait[i,'stage_later']=1
}else if(clin[1,'tumor_stage']=='stage iv' | clin[1,'tumor_stage']=='stage iva' | clin[1,'tumor_stage']=='stage ivb'){
data_trait[i,'stage_later']=1
}else if(clin[1,'tumor_stage']=='stage x'){
data_trait[i,'stage_later']=1
}
}
}
}
row.names(data_trait)=data_trait$id
data_trait = data_trait[,-1]
eng2symbol = read.csv("eng2symbol.csv", header = T)
for(i in 1:nrow(eng2symbol)){
if(eng2symbol[i,2]=="" || is.na(eng2symbol[i,2])){
eng2symbol[i,2]=eng2symbol[i,2]
}
}
mRNA2 = mRNA[,row.names(data_trait)]
mRNA2$ensembl_gene_id = substr(row.names(mRNA2),1,15)
mRNA2 = merge(mRNA2, eng2symbol, by="ensembl_gene_id",all.x = T )
dim(mRNA2)
#60486 1224
mRNA2 = mRNA2[!duplicated(mRNA2$hgnc_symbol) & !is.na(mRNA2$hgnc_symbol),]
dim(mRNA2)
# 37146 1224
row.names(mRNA2)=mRNA2$hgnc_symbol
mRNA2 = mRNA2[,-c(1,ncol(mRNA2))]
dim(mRNA2)
#37146 1222
#移除median<1的
mRNA2$median = apply(mRNA2, 1, median)
mRNA2 = mRNA2[mRNA2$median>1,]
dim(mRNA2)
#15298 1223
mRNA2 = mRNA2[,-ncol(mRNA2)]
dim(mRNA2)
m.mad <- apply(mRNA2,1,mad)
dataExprVar <- mRNA2[which(m.mad >max(quantile(m.mad, probs=seq(0, 1, 0.25))[3],0.01)),]
dim(dataExprVar)
# 3825 1222
library("WGCNA")
datExpr0 = as.matrix(dataExprVar)
dim(datExpr0)
#3049 514
gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK
if (!gsg$allOK)
{
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))
# Remove the offending genes and samples from the data:
datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
datExpr0 = t(datExpr0)
###############################Sample cluster##########样品聚类####################
sampleTree = hclust(dist(datExpr0), method = "average")
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
sizeGrWindow(12,9)
pdf(file = "mRNA/wgcna/1_sampleClustering.pdf", width = 12, height = 9)
par(cex = 0.6)
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
### Plot a line to show the cut
abline(h =100000, col = "red")##剪切高度不确定,故无红线
dev.off()
##############所以这一步不一定能够做,剪切高度问题,这个根据实际设置后可用
### Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 100000, minSize = 10)
table(clust)
### clust 1 contains the samples we want to keep.
keepSamples = (clust==1)
datExpr0 = datExpr0[keepSamples, ]
dim(datExpr0)
#1207 3825
############### 载入性状数据## input trait data###############
#Loading clinical trait data
#traitData = read.table("trait_D.txt",row.names=1,header=T,comment.char = "",check.names=F)########trait file name can be changed######性状数据文件名,根据实际修改,如果工作路径不是实际性状数据路径,需要添加正确的数据路径
#dim(traitData)
#names(traitData)
# remove columns that hold information we do not need.
#allTraits = traitData
#dim(allTraits)
#names(allTraits)
allTraits = data_trait
# Form a data frame analogous to expression data that will hold the clinical traits.
fpkmSamples = rownames(datExpr0)
traitSamples =rownames(allTraits)
traitRows = match(fpkmSamples, traitSamples)
datTraits = allTraits[traitRows,]
rownames(datTraits)
collectGarbage()
# Re-cluster samples
sampleTree2 = hclust(dist(datExpr0), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits, signed = FALSE)
# Plot the sample dendrogram and the colors underneath.
#sizeGrWindow(12,12)
pdf(file="mRNA/wgcna/2_Sample dendrogram and trait heatmap.pdf",width=12,height=12)
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datTraits),
main = "Sample dendrogram and trait heatmap")
dev.off()
##################################
#############################network constr########################################
# Allow multi-threading within WGCNA. At present this call is necessary.
# Any error here may be ignored but you may want to update WGCNA if you see one.
# Caution: skip this line if you run RStudio or other third-party R environments.
# See note above.
enableWGCNAThreads()
options(stringsAsFactors = FALSE)
# Choose a set of soft-thresholding powers
powers = c(1:30)
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr0, powerVector = powers, verbose = 5)
# Plot the results:
#sizeGrWindow(9, 5)
pdf(file="mRNA/wgcna/3_Scale independence.pdf",width=9,height=5)
par(mfrow = c(1,2))
cex1 = 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()
######chose the softPower
softPower =sft$powerEstimate
# 4
adjacency = adjacency(datExpr0, power = softPower)
##### Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency);
dissTOM = 1-TOM
# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
#sizeGrWindow(12,9)
pdf(file="mRNA/wgcna/4_Gene clustering on TOM-based dissimilarity.pdf",width=12,height=9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
dev.off()
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 100
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
#sizeGrWindow(8,6)
pdf(file="mRNA/wgcna/5_Dynamic Tree Cut.pdf",width=8,height=6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
dev.off()
# Calculate eigengenes
MEList = moduleEigengenes(datExpr0, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average")
# Plot the result
#sizeGrWindow(7, 6)
pdf(file="mRNA/wgcna/6_Clustering of module eigengenes.pdf",width=7,height=6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
MEDissThres = 0.25######剪切高度可修改
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
dev.off()
# Call an automatic merging function
merge = mergeCloseModules(datExpr0, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
#sizeGrWindow(12, 9)
pdf(file="mRNA/wgcna/7_merged dynamic.pdf", width = 9, height = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs
# Save module colors and labels for use in subsequent parts
#save(MEs, TOM, dissTOM, moduleLabels, moduleColors, geneTree, sft, file = "mRNA/wgcna_tumor/networkConstruction-stepByStep.RData")
##############################relate modules to external clinical triats######################################
# Define numbers of genes and samples
nGenes = ncol(datExpr0)
nSamples = nrow(datExpr0)
moduleTraitCor = cor(MEs, datTraits, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#sizeGrWindow(10,6)
pdf(file="mRNA/wgcna/8_Module-trait relationships.pdf",width=8,height=8)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 1,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
######## Define variable weight containing all column of datTraits
###MM and GS
# names (colors) of the modules
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr0, MEs, use = "p"))
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
#names of those trait
traitNames=names(datTraits)
geneTraitSignificance = as.data.frame(cor(datExpr0, datTraits, use = "p"))
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", traitNames, sep="")
names(GSPvalue) = paste("p.GS.", traitNames, sep="")
####plot MM vs GS for each trait vs each module
##########example:royalblue and CK
#module="royalblue"
#column = match(module, modNames)
#moduleGenes = moduleColors==module
#trait="CK"
#traitColumn=match(trait,traitNames)
#sizeGrWindow(7, 7)
#par(mfrow = c(1,1))
#verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
#abs(geneTraitSignificance[moduleGenes, traitColumn]),
#xlab = paste("Module Membership in", module, "module"),
#ylab = paste("Gene significance for ",trait),
#main = paste("Module membership vs. gene significance\n"),
#cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
######
for (trait in traitNames){
traitColumn=match(trait,traitNames)
for (module in modNames){
column = match(module, modNames)
moduleGenes = moduleColors==module
if (nrow(geneModuleMembership[moduleGenes,]) > 1){####进行这部分计算必须每个模块内基因数量大于2,由于前面设置了最小数量是30,这里可以不做这个判断,但是grey有可能会出现1个gene,它会导致代码运行的时候中断,故设置这一步
#sizeGrWindow(7, 7)
pdf(file=paste("mRNA/wgcna/model_trait/9_", trait, "_", module,"_Module membership vs gene significance.pdf",sep=""),width=7,height=7)
par(mfrow = c(1,1))
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, traitColumn]),
xlab = paste("Module Membership in", module, "module"),
ylab = paste("Gene significance for ",trait),
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
dev.off()
}
}
}
#####
names(datExpr0)
probes = colnames(datExpr0)
#################export GS and MM###############
geneInfo0 = data.frame(probes= probes,
moduleColor = moduleColors)
for (Tra in 1:ncol(geneTraitSignificance))
{
oldNames = names(geneInfo0)
geneInfo0 = data.frame(geneInfo0, geneTraitSignificance[,Tra],
GSPvalue[, Tra])
names(geneInfo0) = c(oldNames,names(geneTraitSignificance)[Tra],
names(GSPvalue)[Tra])
}
for (mod in 1:ncol(geneModuleMembership))
{
oldNames = names(geneInfo0)
geneInfo0 = data.frame(geneInfo0, geneModuleMembership[,mod],
MMPvalue[, mod])
names(geneInfo0) = c(oldNames,names(geneModuleMembership)[mod],
names(MMPvalue)[mod])
}
geneOrder =order(geneInfo0$moduleColor)
geneInfo = geneInfo0[geneOrder, ]
write.table(geneInfo, file = "mRNA/wgcna/10_GS_and_MM.xls",sep="\t",row.names=F)
####################################################Visualizing the gene network#######################################################
nGenes = ncol(datExpr0)
nSamples = nrow(datExpr0)
# Transform dissTOM with a power to make moderately strong connections more visible in the heatmap
plotTOM = dissTOM^softPower
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA
# Call the plot function
#sizeGrWindow(9,9)
#pdf(file="mRNA/wgcna_tumor/12_Network heatmap plot_all gene.pdf",width=9, height=9)
#TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
#dev.off()
nSelect = 1000
# For reproducibility, we set the random seed
set.seed(10)
select = sample(nGenes, size = nSelect)
selectTOM = dissTOM[select, select]
# There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
# Open a graphical window
#sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^softPower
diag(plotDiss) = NA
pdf(file="mRNA/wgcna/13_Network heatmap plot_selected genes.pdf",width=9, height=9)
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
dev.off()
####################################################Visualizing the gene network of eigengenes####################################################
#sizeGrWindow(5,7.5)
pdf(file="mRNA/wgcna/14_Eigengene dendrogram and Eigengene adjacency heatmap.pdf", width=5, height=7.5)
par(cex = 0.9)
plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)
dev.off()
#or devide into two parts
# Plot the dendrogram
#sizeGrWindow(6,6);
pdf(file="mRNA/wgcna/15_Eigengene dendrogram_2.pdf",width=6, height=6)
par(cex = 1.0)
plotEigengeneNetworks(MEs, "Eigengene dendrogram", marDendro = c(0,4,2,0), plotHeatmaps = FALSE)
dev.off()
pdf(file="mRNA/wgcna/15_Eigengene adjacency heatmap_2.pdf",width=6, height=6)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
par(cex = 1.0)
plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2), plotDendrograms = FALSE, xLabelsAngle = 90)
dev.off()
###########################Exporting to Cytoscape all one by one ##########################
# Select each module
for (mod in 1:nrow(table(moduleColors)))
{
modules = names(table(moduleColors))[mod]
# Select module probes
probes = colnames(datExpr0)
inModule = (moduleColors == modules)
modProbes = probes[inModule]
modGenes = modProbes
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule]
dimnames(modTOM) = list(modProbes, modProbes)
# Export the network into edge and node list files Cytoscape can read
cyt = exportNetworkToCytoscape(modTOM,
edgeFile = paste("mRNA/wgcna/cyto_0.10/CytoscapeInput-edges-", modules , ".txt", sep=""),
nodeFile = paste("mRNA/wgcna/cyto_0.10/CytoscapeInput-nodes-", modules, ".txt", sep=""),
weighted = TRUE,
threshold = 0.10,
nodeNames = modProbes,
altNodeNames = modGenes,
nodeAttr = moduleColors[inModule])
}
附件