## Data processing ----
## Reads an Illumina final report with lumi, runs quality control, applies a
## log2 transformation and RSN normalisation, drops probes that are not
## detected in any sample, and writes the filtered expression matrix to disk.

## Install the lumi package only when it is missing, so the script does not
## download and reinstall it on every run. The installer script is fetched
## over HTTPS rather than plain HTTP.
## NOTE: biocLite() was retired with Bioconductor 3.8; on current R versions
## install lumi with BiocManager::install("lumi") before running this script.
if (!requireNamespace("lumi", quietly = TRUE)) {
  source("https://bioconductor.org/biocLite.R")
  biocLite("lumi")
}
library(lumi)

## Specify the file name
fileName <- "FinalReport_SampleProbe.txt"

## Load the data (probe identifiers are not converted to nuIDs)
x.lumi <- lumiR(fileName, convertNuID = FALSE)

## Quality control on the raw data
x.lumi <- lumiQ(x.lumi)

## Summary of the quality control
summary(x.lumi, "QC")

## Log2 transformation
x.lumiT <- lumiT(x.lumi, method = "log2")

## Data normalization (robust spline normalization)
x.lumiN <- lumiN(x.lumiT, method = "RSN")

## Quality control after normalization
x.lumiNQ <- lumiQ(x.lumiN)

## Summary of the quality control
summary(x.lumiNQ, "QC")

## Remove the unexpressed probes: keep only probes with a positive detection
## count. The detection calls come from the un-normalised object, the
## expression values from the normalised one.
## NOTE(review): no filter for un-annotated probes is applied here, although
## the original comment mentioned one -- confirm whether it is needed.
presentCount <- detectionCall(x.lumi)
dataMatrix <- exprs(x.lumiNQ)
## drop = FALSE keeps a matrix even if only a single probe passes the filter
dataMatrixF <- dataMatrix[presentCount > 0, , drop = FALSE]

## Output the data as a tab-separated text file. write.table() separates
## fields with a space by default, so the separator is set explicitly;
## col.names = NA writes an empty header cell above the row names so the
## header lines up with the data columns.
write.table(dataMatrixF, file = "processedData.txt", sep = "\t",
            quote = FALSE, col.names = NA)

## Organize columns so that samples are in groups as follows: VL patients,
## asymptomatic individuals, uninfected controls and treated individuals.

# ------------------------------------------------------------------------------

##Identify dierentially expressed genes
##install lumi package
source("http://bioconductor.org/biocLite.R")
biocLite("limma")
library(limma)

probeList <- rownames(dataMatrixF)
design <- model.matrix(~ 0+factor(c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,4,4,4,4,4,4,4,4)))
colnames(design) <- c("VL", "DTH", "CTRL", "TRT")

## call lmFit on your data
fit <- lmFit(data_matrix, design)
contrast.matrix <- makeContrasts(VL-DTH, VL-CTRL, TREATED-VL, TREATED-CTRL, TREATED-DTH, DTH-CTRL, levels=design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)

## Get gene symbols for the probes ----
## Installs the annotation packages only when they are missing (one installer
## download instead of two, fetched over HTTPS), then attaches a gene symbol
## to every probe in the fitted model.
## NOTE: biocLite() was retired with Bioconductor 3.8; on current R versions
## install these packages with BiocManager::install() before running.
missingPkgs <- Filter(
  function(p) !requireNamespace(p, quietly = TRUE),
  c("illuminaHumanv4.db", "annotate")
)
if (length(missingPkgs) > 0) {
  source("https://bioconductor.org/biocLite.R")
  biocLite(missingPkgs)
}
library(illuminaHumanv4.db)
library(annotate)

## Look up the symbol of each probe in the illuminaHumanv4.db annotation
geneSymbol <- getSYMBOL(probeList, "illuminaHumanv4.db")

## Store probe ID and symbol with the fit so topTable() reports them;
## stringsAsFactors = FALSE keeps both columns as character on R < 4.0.
fit2$genes <- data.frame(ID = probeList, geneSymbol = geneSymbol,
                         stringsAsFactors = FALSE)

## Get differentially expressed genes ----
## One full table (number = Inf) per contrast, with Benjamini-Hochberg
## adjusted p-values. `coef` indexes the contrasts in the order they were
## passed to makeContrasts().
## The result names use "_vs_" because a name such as `VL-DTH` is parsed as a
## subtraction and cannot be the target of an assignment.
VL_vs_DTH <- topTable(fit2, coef = 1, adjust.method = "BH", number = Inf)
VL_vs_CTRL <- topTable(fit2, coef = 2, adjust.method = "BH", number = Inf)
TREATED_vs_VL <- topTable(fit2, coef = 3, adjust.method = "BH", number = Inf)
TREATED_vs_CTRL <- topTable(fit2, coef = 4, adjust.method = "BH", number = Inf)
TREATED_vs_DTH <- topTable(fit2, coef = 5, adjust.method = "BH", number = Inf)
DTH_vs_CTRL <- topTable(fit2, coef = 6, adjust.method = "BH", number = Inf)

# ------------------------------------------------------------------------------

## WGCNA ----
## Chooses a soft-thresholding power for network construction by plotting the
## scale-free topology fit and the mean connectivity over candidate powers.
library(WGCNA)
options(stringsAsFactors = FALSE)

## Filter the data for the most variant genes (in the paper the 3700 most
## variant genes were used). The original script used `datExpr` without ever
## creating it; it is built here unless the caller has already prepared one.
## NOTE(review): assumes the input is the filtered, normalised matrix
## `dataMatrixF` with all samples -- confirm against the published analysis.
if (!exists("datExpr", inherits = FALSE)) {
  nTopGenes <- 3700
  probeVariance <- apply(dataMatrixF, 1, var, na.rm = TRUE)
  topProbes <- order(probeVariance, decreasing = TRUE)[
    seq_len(min(nTopGenes, length(probeVariance)))
  ]
  ## WGCNA expects samples in rows and genes in columns, hence the transpose
  datExpr <- t(dataMatrixF[topProbes, , drop = FALSE])
}

## Choose a set of soft-thresholding powers
powers <- c(1:10, seq(from = 12, to = 20, by = 2))

## Call the network topology analysis function
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)

## Plot the results in two panels side by side
sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 <- 0.9

## Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit,signed R^2",
     type = "n", main = "Scale independence")
text(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "red")

## This line corresponds to using an R^2 cut-off of h
abline(h = 0.80, col = "red")

## Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)", ylab = "Mean Connectivity",
     type = "n", main = "Mean connectivity")
text(sft$fitIndices[, 1], sft$fitIndices[, 5],
     labels = powers, cex = cex1, col = "red")

## Network construction and module detection ----
## Builds the adjacency matrix at the chosen power, converts it to a
## topological-overlap dissimilarity, clusters the genes and cuts the tree
## into modules.

## Adjacency at the selected soft-thresholding power
softPower <- 12
adjacency <- adjacency(datExpr, power = softPower)

## Topological overlap and the dissimilarity derived from it
TOM <- TOMsimilarity(adjacency)
dissTOM <- 1 - TOM

## Average-linkage hierarchical clustering of the genes
geneTree <- hclust(as.dist(dissTOM), method = "average")

## Dendrogram of the clustering
sizeGrWindow(12, 9)
plot(
  geneTree,
  xlab = "",
  sub = "",
  main = "Gene clustering on TOM-based dissimilarity",
  labels = FALSE,
  hang = 0.04
)

## Minimum module size, set relatively high
minModuleSize <- 30

## Module identification with the dynamic tree cut
dynamicMods <- cutreeDynamic(
  dendro = geneTree,
  distM = dissTOM,
  deepSplit = 2,
  pamRespectsDendro = FALSE,
  minClusterSize = minModuleSize
)
table(dynamicMods)

## Numeric module labels converted to colours
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)

## Dendrogram with the module colours drawn underneath
sizeGrWindow(8, 6)
plotDendroAndColors(
  geneTree,
  dynamicColors,
  "Dynamic Tree Cut",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05,
  main = "Gene dendrogram and module colors"
)

## Merge modules ----
## Clusters the module eigengenes and marks the height below which modules
## are considered close enough to merge.

## Eigengene of every dynamic-tree-cut module
MEList <- moduleEigengenes(datExpr, colors = dynamicColors)
MEs <- MEList$eigengenes

## Dissimilarity between module eigengenes (1 - correlation)
MEDiss <- 1 - cor(MEs)

## Average-linkage clustering of the eigengenes
METree <- hclust(as.dist(MEDiss), method = "average")

## Eigengene dendrogram
sizeGrWindow(7, 6)
plot(
  METree,
  main = "Clustering of module eigengenes",
  xlab = "",
  sub = ""
)

## Merge threshold, drawn as a cut line on the dendrogram
MEDissThres <- 0.25
abline(h = MEDissThres, col = "red")

## Merge close modules and save the result ----
## Merges modules whose eigengenes are closer than MEDissThres, plots the
## original and merged colours under the gene dendrogram, and saves the
## module assignment for later use.

## Call an automatic merging function
merge <- mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres,
                           verbose = 3)

## The merged module colors
mergedColors <- merge$colors

## Eigengenes of the new merged modules
mergedMEs <- merge$newMEs

## Plot new dendrogram with the original and the merged modules
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

## Rename to moduleColors
moduleColors <- mergedColors

## Construct numerical labels corresponding to the colors ("grey" maps to 0)
colorOrder <- c("grey", standardColors(50))
moduleLabels <- match(moduleColors, colorOrder) - 1
MEs <- mergedMEs

## Save module colors and labels for use in subsequent parts. The original
## call was missing its closing parenthesis. save() writes R's own data
## format whatever the file extension, so read this file back with load().
save(MEs, moduleLabels, moduleColors, geneTree, file = "Merged_modules.txt")
# ------------------------------------------------------------------------------

## Cell deconvolution ----
## Estimates cell-type proportion proxies from the expression data with
## CellMix, using the HaemAtlas marker gene list.
source("http://www.bioconductor.org/biocLite.R")
biocLite(
  "CellMix",
  siteRepos = "http://web.cbio.uct.ac.za/~renaud/CRAN",
  type = "both"
)
library(CellMix)
library(GEOquery)

## Marker gene list
m1 <- MarkerList("HaemAtlas")

## Target expression data as a matrix.
## NOTE(review): `target` is not created anywhere in this script -- it must be
## loaded by the user beforehand; confirm which data set is intended.
target1 <- as.matrix(target)

## Convert the marker identifiers to those used by the target data
m1 <- convertIDs(m1, target1)

## Proportion proxies computed as the mean expression cell profile
meanProf <- ged(target1, m1, method = "meanProfile")

## The proportion proxies are stored in the coefficient matrix
coef(meanProf)