# Analysis script: reads PiwowarData.csv, prints summary counts, runs the
# Fisher tests for Table 1, the regressions for Tables 2 and 3, and writes
# the two boxplot figures (figure1.eps, figure2.eps).

# Read in data
# NOTE(review): the working directory below is an absolute, machine-specific
# path; the script only runs where this directory and the CSV exist.
setwd("e:\\research\\PLoS ONE submission\\Revised\\")
dat = read.csv("PiwowarData.csv", header = TRUE, row.names = 1)
# Values are print()ed explicitly so they also appear when the script is run
# through source(), which does not auto-print top-level expressions.
print(dim(dat))
print(rownames(dat))
print(colnames(dat))

# Calculate a few fields which will be useful later
dat$cohortmonthsfromend =
  max(dat$Number.of.months.between.1.99.and.trial.publication) -
  dat$Number.of.months.between.1.99.and.trial.publication
dat$Number.of.cases.in.trial.gt25 = dat$Number.of.cases.in.trial > 25

# A quick summary of the data
# NOTE(review): summing the logical `> 1` counts papers with more than one
# citation in 2004-2005 per group, not all papers -- confirm this is intended.
print("Number of papers")
print("Data not available, Data available")
print(tapply(dat$Number.of.Citations.during.2004.2005 > 1,
             dat$Is.the.microarray.data.publicly.available,
             sum))
print("Number of citations")
print("Data not available, Data available")
print(tapply(dat$Number.of.Citations.during.2004.2005,
             dat$Is.the.microarray.data.publicly.available,
             sum))

## Table 1
# Note that the odds-ratio estimate returned by fisher.test() is not exactly
# the same as the sample odds ratio (a*d)/(b*c). The paper actually reports
# the latter, but uses the Fisher values for the confidence interval.
impact.2x2 = table(dat$Impact.factor.of.journal < 25,
                   !dat$Is.the.microarray.data.publicly.available)
print(impact.2x2)
impact.fisher = fisher.test(impact.2x2)
print(round(impact.fisher$estimate, 1))
print(round(impact.fisher$conf.int, 1))

year.2x2 = table(dat$Number.of.months.between.1.99.and.trial.publication > 24,
                 !dat$Is.the.microarray.data.publicly.available)
print(year.2x2)
year.fisher = fisher.test(year.2x2)
print(round(year.fisher$estimate, 1))
print(round(year.fisher$conf.int, 1))

usauth.2x2 = table(dat$Are.there.any.authors.from.the.US == 0,
                   !dat$Is.the.microarray.data.publicly.available)
print(usauth.2x2)
usauth.fisher = fisher.test(usauth.2x2)
print(round(usauth.fisher$estimate, 1))
print(round(usauth.fisher$conf.int, 1))

### Table 2
## Primary analysis

# Some helper functions

# Shared worker for calcCI.exp and calcCI.noexp.
#   res       - a fitted model whose summary() has a `coefficients` matrix
#               (the lm fits below)
#   param     - row name of the coefficient to report
#   transform - function applied to the estimate and to both interval ends
# Returns a list with param, est (rounded to 2 places), CI (estimate -/+ 1.96
# standard errors, rounded to 2 places) and p (rounded to 3 places).
calcCI.generic = function(res, param, transform) {
  coef.row = summary(res)$coefficients[param, ]
  x = coef.row[1]    # column 1: estimate
  se = coef.row[2]   # column 2: standard error
  p = coef.row[4]    # column 4: p-value
  list(param = param,
       est = round(transform(x), 2),
       CI = c(round(transform(x - 1.96 * se), 2),
              round(transform(x + 1.96 * se), 2)),
       p = round(p, 3))
}

# Estimate and interval reported on the exponentiated scale.
calcCI.exp = function(res, param) {
  calcCI.generic(res, param, exp)
}

# Estimate and interval reported on the coefficient's own scale.
calcCI.noexp = function(res, param) {
  calcCI.generic(res, param, identity)
}

# Print the estimate, interval and p-value for the four covariates of a
# primary-analysis fit.
all.results = function(res) {
  # give the results of the impact factor without exp because it is the
  # log impact factor, so interpretation is easier if kept in the log domain
  print(calcCI.noexp(res, "lnimpact"))
  print(calcCI.exp(res, "Are.there.any.authors.from.the.US"))
  print(calcCI.exp(res, "Number.of.months.between.1.99.and.trial.publication"))
  print(calcCI.exp(res, "Is.the.microarray.data.publicly.available"))
}

# Take the log of the endpoints and impact factor
# NOTE(review): log() of a zero count is -Inf; this assumes every citation
# count and impact factor is positive -- confirm against the data.
dat$lnimpact = log(dat$Impact.factor.of.journal)
dat$lncites0405 = log(dat$Number.of.Citations.during.2004.2005)
dat$lncites24months =
  log(dat$Number.of.Citations.in.first.24.months.after.publication)

# Define the lower-profile subset
which.subset = which(
  (dat$Impact.factor.of.journal < 25) &
    (dat$Number.of.months.between.1.99.and.trial.publication > 24)
)
dat.subset = dat[which.subset, ]

# A quick summary of the lower-profile subset
print("Number of papers in the lower-profile subset")
print("Data not available, Data available")
print(tapply(dat.subset$Number.of.Citations.during.2004.2005 > 1,
             dat.subset$Is.the.microarray.data.publicly.available,
             sum))
print("Number of citations in the lower profile-subset")
print("Data not available, Data available")
print(tapply(dat.subset$Number.of.Citations.during.2004.2005,
             dat.subset$Is.the.microarray.data.publicly.available,
             sum))

# Do the regressions
result.primary = lm(lncites0405 ~ lnimpact +
                      Number.of.months.between.1.99.and.trial.publication +
                      Are.there.any.authors.from.the.US +
                      Is.the.microarray.data.publicly.available,
                    dat)
all.results(result.primary)

result.primary.24mo = lm(lncites24months ~ lnimpact +
                           Number.of.months.between.1.99.and.trial.publication +
                           Are.there.any.authors.from.the.US +
                           Is.the.microarray.data.publicly.available,
                         dat)
all.results(result.primary.24mo)

result.primary.subset = lm(lncites0405 ~ lnimpact +
                             Number.of.months.between.1.99.and.trial.publication +
                             Are.there.any.authors.from.the.US +
                             Is.the.microarray.data.publicly.available,
                           dat.subset)
all.results(result.primary.subset)

# Table 3
# Exploratory results
# Articles, No. is confirmed below
# Citations, No. is therefore not confirmed; assumed to be correct based on
# prior computation
# The coefficient of the covariate of interest is exponentiated with exp()
# and called "fold increase" since covariates are binary
# note the new p-values
# NOTE(review): exp() is applied to the whole second coefficient row, so the
# standard error, t value and p-value are exponentiated as well as the
# estimate -- confirm which of the printed numbers are meant to be read.
# Each stanza below: fit on the data-available rows with the covariate of
# interest listed first (so it is coefficient row 2), print the call and the
# exponentiated row, then print article counts and citation sums (with their
# proportions) per covariate level.

# Use subset that makes data available
which.Is.the.microarray.data.publicly.available =
  which(dat$Is.the.microarray.data.publicly.available == 1)
dat.da = dat[which.Is.the.microarray.data.publicly.available, ]
n.Is.the.microarray.data.publicly.available =
  length(which.Is.the.microarray.data.publicly.available)

# Do the calculations
result.expl.n = lm(lncites0405 ~ Number.of.cases.in.trial.gt25 + lnimpact +
                     Are.there.any.authors.from.the.US +
                     Number.of.months.between.1.99.and.trial.publication,
                   subset = which.Is.the.microarray.data.publicly.available,
                   dat)
print(result.expl.n$call)
print(round(exp(summary(result.expl.n)$coefficients[2, ]), 2))
tab = table(dat.da$Number.of.cases.in.trial.gt25)
print(tab)
print(round(tab / sum(tab), 2))
tab = tapply(dat.da$Number.of.Citations.during.2004.2005,
             dat.da$Number.of.cases.in.trial.gt25,
             sum)
print(tab)
print(round(tab / sum(tab), 2))

result.expl.clinical = lm(lncites0405 ~ Trial.has.a.clinical.endpoint +
                            lnimpact +
                            Are.there.any.authors.from.the.US +
                            Number.of.months.between.1.99.and.trial.publication,
                          subset = which.Is.the.microarray.data.publicly.available,
                          dat)
print(result.expl.clinical$call)
print(round(exp(summary(result.expl.clinical)$coefficients[2, ]), 2))
tab = table(dat.da$Trial.has.a.clinical.endpoint)
print(tab)
print(round(tab / sum(tab), 2))
tab = tapply(dat.da$Number.of.Citations.during.2004.2005,
             dat.da$Trial.has.a.clinical.endpoint,
             sum)
print(tab)
print(round(tab / sum(tab), 2))

result.expl.affy = lm(lncites0405 ~ Uses.the.Affymetrix.microarray.platform +
                        lnimpact +
                        Are.there.any.authors.from.the.US +
                        Number.of.months.between.1.99.and.trial.publication,
                      subset = which.Is.the.microarray.data.publicly.available,
                      dat)
print(result.expl.affy$call)
print(round(exp(summary(result.expl.affy)$coefficients[2, ]), 2))
tab = table(dat.da$Uses.the.Affymetrix.microarray.platform)
print(tab)
print(round(tab / sum(tab), 2))
tab = tapply(dat.da$Number.of.Citations.during.2004.2005,
             dat.da$Uses.the.Affymetrix.microarray.platform,
             sum)
print(tab)
print(round(tab / sum(tab), 2))

result.expl.geo = lm(lncites0405 ~ In.the.GEO.database + lnimpact +
                       Are.there.any.authors.from.the.US +
                       Number.of.months.between.1.99.and.trial.publication,
                     subset = which.Is.the.microarray.data.publicly.available,
                     dat)
print(result.expl.geo$call)
print(round(exp(summary(result.expl.geo)$coefficients[2, ]), 2))
tab = table(dat.da$In.the.GEO.database)
print(tab)
print(round(tab / sum(tab), 2))
tab = tapply(dat.da$Number.of.Citations.during.2004.2005,
             dat.da$In.the.GEO.database,
             sum)
print(tab)
print(round(tab / sum(tab), 2))

result.expl.smd = lm(lncites0405 ~ In.the.SMD.database + lnimpact +
                       Are.there.any.authors.from.the.US +
                       Number.of.months.between.1.99.and.trial.publication,
                     subset = which.Is.the.microarray.data.publicly.available,
                     dat)
print(result.expl.smd$call)
print(round(exp(summary(result.expl.smd)$coefficients[2, ]), 2))
tab = table(dat.da$In.the.SMD.database)
print(tab)
print(round(tab / sum(tab), 2))
tab = tapply(dat.da$Number.of.Citations.during.2004.2005,
             dat.da$In.the.SMD.database,
             sum)
print(tab)
print(round(tab / sum(tab), 2))

result.expl.raw = lm(lncites0405 ~ Raw.data.such.as.CEL.files.are.available +
                       lnimpact +
                       Are.there.any.authors.from.the.US +
                       Number.of.months.between.1.99.and.trial.publication,
                     subset = which.Is.the.microarray.data.publicly.available,
                     dat)
print(result.expl.raw$call)
print(round(exp(summary(result.expl.raw)$coefficients[2, ]), 2))
tab = table(dat.da$Raw.data.such.as.CEL.files.are.available)
print(tab)
print(round(tab / sum(tab), 2))
tab = tapply(dat.da$Number.of.Citations.during.2004.2005,
             dat.da$Raw.data.such.as.CEL.files.are.available,
             sum)
print(tab)
print(round(tab / sum(tab), 2))

result.expl.suppl = lm(lncites0405 ~ Publication.mentions.Supplemental.data +
                         lnimpact +
                         Are.there.any.authors.from.the.US +
                         Number.of.months.between.1.99.and.trial.publication,
                       subset = which.Is.the.microarray.data.publicly.available,
                       dat)
print(result.expl.suppl$call)
print(round(exp(summary(result.expl.suppl)$coefficients[2, ]), 2))
tab = table(dat.da$Publication.mentions.Supplemental.data)
print(tab)
print(round(tab / sum(tab), 2))
tab = tapply(dat.da$Number.of.Citations.during.2004.2005,
             dat.da$Publication.mentions.Supplemental.data,
             sum)
print(tab)
print(round(tab / sum(tab), 2))

result.expl.oncomine = lm(lncites0405 ~ Publication.has.an.Oncomine.profile +
                            lnimpact +
                            Are.there.any.authors.from.the.US +
                            Number.of.months.between.1.99.and.trial.publication,
                          subset = which.Is.the.microarray.data.publicly.available,
                          dat)
print(result.expl.oncomine$call)
print(round(exp(summary(result.expl.oncomine)$coefficients[2, ]), 2))
tab = table(dat.da$Publication.has.an.Oncomine.profile)
print(tab)
print(round(tab / sum(tab), 2))
tab = tapply(dat.da$Number.of.Citations.during.2004.2005,
             dat.da$Publication.has.an.Oncomine.profile,
             sum)
print(tab)
print(round(tab / sum(tab), 2))

### Figure 1
# The group sizes in the axis labels are hard-coded strings; the table()
# printed just before each plot shows the actual counts to compare against.
print(table(dat$Is.the.microarray.data.publicly.available))
boxplot(Number.of.Citations.during.2004.2005 ~
          Is.the.microarray.data.publicly.available,
        data = dat,
        boxwex = 0.5,
        names = c("Data Not Shared (n=44)", "Data Shared (n=41)"),
        ylab = "Number of Citations in 2004-2005",
        outline = TRUE,
        notch = FALSE,
        log = "y")
# Copy the on-screen plot to an EPS file, then close the postscript device.
dev.copy(postscript, file = "figure1.eps", width = 6, height = 6,
         horizontal = FALSE, onefile = FALSE)
dev.off()

print(table(dat.subset$Is.the.microarray.data.publicly.available))
# dev.new() opens a fresh device of the platform's default type; windows()
# is only available on Windows builds of R.
dev.new()
boxplot(Number.of.Citations.during.2004.2005 ~
          Is.the.microarray.data.publicly.available,
        data = dat.subset,
        boxwex = 0.5,
        names = c("Data Not Shared (n=43)", "Data Shared (n=27)"),
        ylab = "Number of Citations in 2004-2005",
        outline = TRUE,
        notch = FALSE,
        log = "y")
dev.copy(postscript, file = "figure2.eps", width = 6, height = 6,
         horizontal = FALSE, onefile = FALSE)
dev.off()