## Estimate county-level Arsenic levels, directly-standardized lung cancer incidence and smoking rates for each county in the study area. ## Estimate association and interaction between exposure, smoking, median income, and lung cancer incidence. ## As per PONE-D-11-06364 ## Association of Arsenic Exposure with Lung Cancer Incidence Rates in the United States PLoS ONE ## Putila J and Guo NL ## MBR Cancer Center and WVU Department of Community Medicine ## Room 1814 ## Morgantown WV, 26506 rm(list=ls()) gc() ## Import Population Data ## County-level estimates only from the 2000 U.S. Census CApop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Census/CA/2kh06_mod.csv") CTpop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Census/CT/2kh09_mod.csv") IApop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Census/IA/2kh19_mod.csv") KYpop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Census/KY/2kh21_mod.csv") LApop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Census/LA/2kh22_mod.csv") NJpop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Census/NJ/2kh34_mod.csv") NMpop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Census/NM/2kh35_mod.csv") UTpop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Census/UT/2kh49_mod.csv") ## Remove extraneous character from county name IApop$Area <- ifelse(IApop$Area=="O'Brien County, Iowa", "OBrien County, Iowa", as.character(factor(IApop$Area))) ## Combine individual population files Allpop <- rbind(CApop, CTpop, IApop, KYpop, LApop, NJpop, NMpop, UTpop) ## Total number of counties thus far NUMCOUNT <- length(Allpop$Area)[1] NUMCOUNT ## Should be 432 ## Shortened name for the county CountyShort <- as.character(substr(Allpop$Area, 1,8)) ## Move county name to end of data frame Area <- Allpop$Area Allpop <- Allpop[,-1] Allpop <- data.frame(Allpop, CountyShort, Area) ## Read population data for the U.S. US <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Census/US/2kh00_mod.csv") ## Remove row name, total, males, females columns US <- US[,-c(1:4)] ## Import Incidence Data from SEER ## IA,KY,LA,NJ,NM,UT,CA,CT Allinc <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/SEER/all_dca.txt") ## Only 2000+ CA entries Allinc <- Allinc[substr(Allinc$StateCounty,1,2)!="CA" | Allinc$YearDx>=2000,] ## Missouri, MO - age adjusted MOinc <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/MO/mica_mod.csv") MOinc$County <- paste(as.character(factor(MOinc$County)), ", Missouri", sep="") MOinc$FIPSCode <- ifelse(as.numeric(MOinc$FIPSCode) < 10 , paste(0, MOinc$FIPSCode, sep=""), MOinc$FIPSCode) MOinc$FIPSCode <- ifelse(as.numeric(MOinc$FIPSCode) < 100, paste(0, MOinc$FIPSCode, sep=""), MOinc$FIPSCode) MOinc$FIPS <- paste("29", MOinc$FIPSCode, sep="") MOpop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/MO/2kh29_mod.csv", header=TRUE) ## Ohio, OH - age adjusted OHinc <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/OH/OHinc.csv") OHinc$County <- paste(as.character(factor(OHinc$County)), " County, Ohio", sep="") OHinc$FIPSCode <- ifelse(as.numeric(OHinc$FIPSCode) < 10 , paste(0, OHinc$FIPSCode, sep=""), OHinc$FIPSCode) OHinc$FIPSCode <- ifelse(as.numeric(OHinc$FIPSCode) < 100, paste(0, OHinc$FIPSCode, sep=""), OHinc$FIPSCode) OHinc$FIPS <- paste("39", OHinc$FIPSCode, sep="") OHpop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/OH/2kh39_mod.csv", header=TRUE) ## West Virginia, WV - age adjusted WVinc <- read.table("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/WV/wv_inc.txt", sep=" ", header=TRUE) WVfips <- read.table("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/WV/wv_fips.txt", sep="\t", header=TRUE) WVpop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/WV/2kh54_mod.csv", header=TRUE) WVinc <- data.frame(WVfips[,2], WVfips[,1], WVinc[,4]) colnames(WVinc) <- c("County", "FIPS", "AdjRate") WVinc$County <- paste(WVinc$County, ", West Virginia", sep="") ## Pennsylvania, PA - age adjusted PAinc <- read.table("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/PA/pa_inc.txt", sep="\t", header=TRUE) PAinc$County <- paste(PAinc$County, "County, Pennsylvania", sep="") PApop <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/PA/2kh42_mod.csv") PAfips <- read.table("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/PA/pa_fips.txt", sep="\t", header=TRUE) PAinc <- data.frame(PAinc[,1], PAfips[,1], PAinc[,10]) #Fix extraneous issues in SEER naming of StateCounty Allinc$StateCounty <- ifelse(Allinc$StateCounty=="CA: Los Angeles Registry (06037)" , "CA: Los Angeles County (06037)", as.character(factor(Allinc$StateCounty))) Allinc$StateCounty <- ifelse(Allinc$StateCounty=="NM: Cibola County (35006) - 1982+", "NM: Cibola County (35006)", as.character(factor(Allinc$StateCounty))) Allinc$StateCounty <- ifelse(Allinc$StateCounty=="NM: Valencia County (35061) - 1982+", "NM: Valencia County (35061)", as.character(factor(Allinc$StateCounty))) ## Create a FIPS code for the SEER data FIPS <- substr(Allinc$StateCounty, nchar(as.character(Allinc$StateCounty))-5, nchar(as.character(Allinc$StateCounty))-1) CountyShort <- substr(Allinc$StateCounty, 5, 12) ## Convert state abbreviation to full name StateT <- substr(Allinc$StateCounty, 1, 2) State <- ifelse(StateT=="CA", ", California", ifelse(StateT=="CT", ", Connecticut", ifelse(StateT=="LA", ", Louisiana", ifelse(StateT=="IA", ", Iowa", ifelse(StateT=="KY", ", Kentucky", ifelse(StateT=="NJ", ", New Jersey", ifelse(StateT=="UT", ", Utah", ifelse(StateT=="NM", ", New Mexico", "ERROR" )))))))) Area <- paste(substr(Allinc$StateCounty, 5, nchar(as.character(Allinc$StateCounty))-8), State, sep="") Allinc <- data.frame(Allinc, CountyShort, FIPS, Area) Allinc <- Allinc[Allinc$CountyShort!="Unknown ",] rm(StateT) ######################## ## Population Weights ## ######################## ## Combined Population for all 12 states. Pop.1 <- rbind(CApop, CTpop, IApop, KYpop, LApop, NJpop, NMpop, UTpop, MOpop, OHpop, WVpop, PApop) rm(CApop, CTpop, IApop, KYpop, LApop, NJpop, NMpop, UTpop, OHpop, MOpop, WVpop, PApop) ## Sum the population of each age group to get the total population Pop <- matrix(NA, nrow=dim(Pop.1)[1], ncol=2) for(i in 1:dim(Pop.1)[1]){ Pop[i,1] <- as.character(factor(Pop.1[i,1])) Pop[i,2] <- sum(Pop.1[i,2:14]) } colnames(Pop) <- c("PopCounty", "Population") ############################# ## Calculate Raw Incidence ## ############################# ## Lung Cancer diagnosed between 1996 and 2005 ## Obtained from the NCI-SEER database. ## Age Groups Pairs agroups <- c(0,4,5,9,10,14,15,19,20,24,25,34,35,44,45,54,55,59,60,64,65,74,75,84,85,130) crate <- matrix(rep(NA,NUMCOUNT*13), NUMCOUNT, 13) County <- matrix(rep(NA, NUMCOUNT), NUMCOUNT, 1) FIPS <- matrix(rep(NA, NUMCOUNT), NUMCOUNT, 1) ## Years of data available ## Either 6 or 10 depending upon when the state entered SEER yearsdata <- NA for(i in 1:NUMCOUNT){ yearsdata[i] <- ifelse(unique(substr(Allinc[as.character(Allinc$Area)==as.character(Allpop[i,]$Area),]$StateCounty,1,2)) %in% c("CT", "IA", "NM", "UT"), 10, ifelse(unique(substr(Allinc[as.character(Allinc$Area)==as.character(Allpop[i,]$Area),]$StateCounty,1,2)) %in% c("CA", "LA", "KY", "NJ"), 6, NA)) } ## SEER Areas ## Calculates the crude rate (crate) by taking the total number of cases and dividing that by the number of years (6 or 10) ## then multiplies by 100,000 to get the average incidence rate per 100,000 persons in each of the j=1:13 age groups for(i in 1:NUMCOUNT){ crateRow <- NA for(j in 1:13){ crate[i,j] <- ((dim(Allinc[as.character(Allinc$Area)==as.character(Allpop[i,]$Area) & (Allinc$YearDx <= 2005 & Allinc$YearDx >= 1996) & (Allinc$AgeDx >= agroups[j+(j-1)] & Allinc$AgeDx <= agroups[j+j] ), ])[1] / yearsdata[i]) / Allpop[i,j])*100000 } County[i] <- as.character(Allpop[i,15]) FIPS[i] <- as.character(unique(factor(Allinc[as.character(Allinc$Area)==as.character(factor(Allpop[i,]$Area)),]$FIPS))) } ## Direct Standardization to the 2000 U.S. Population Allstd <- matrix(NA, nrow=NUMCOUNT, ncol=13) AdjRate <- NA ## Directly standardizes the rate to the 2000 U.S. population, using the US census data for(i in 1:NUMCOUNT){ for(j in 1:13){ Allstd[i,j] <- (crate[i,j] * US[1,j]) } AdjRate[i] <- (sum(Allstd[i,]) / sum(US[1,])) } ## Combine the calculated rates with the published rates temp <- data.frame(County, FIPS, AdjRate) colnames(temp) <- c ("County", "FIPS", "AdjRate") colnames(MOinc) <- c("County", "AdjRate", "Fipsshort", "FIPS") colnames(OHinc) <- c("County", "AdjRate", "Fipsshort", "FIPS") colnames(WVinc) <- c("County", "FIPS", "AdjRate") colnames(PAinc) <- c("County", "FIPS", "AdjRate") ## Combine SEER and independent incidence full <- rbind(temp, MOinc[,c(1,4,2)], OHinc[,c(1,4,2)], as.matrix(WVinc), as.matrix(PAinc)) ## Add SES data full <- cbind(full, Pop) rm(temp) ## Update the number of counties, now 757 NUMCOUNT <- dim(full)[1] NUMCOUNT ## Remove Sets rm(MOinc, OHinc, WVinc, PAinc) ###################### ## Read in SES data ## ###################### ## Columns to read from each set wide <- c(2 , 4, 127, 6, 53, 44, 23) colname <- c("SFIPS", "CFIPS", "Junk1", "MedIncome", "Junk2", "SESName", "Junk3") ## Data sets CAses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Census/CA/est00_CA.dat", widths = wide, col.names = colname) CAses <- CAses[2:59,c(1,2,4,6)] CTses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Census/CT/est00_CT.dat", widths = wide, col.names = colname) CTses <- CTses[2:9,c(1,2,4,6)] IAses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Census/IA/est00_IA.dat", widths = wide, col.names = colname) IAses <- IAses[2:100,c(1,2,4,6)] KYses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Census/KY/est00_KY.dat", widths = wide, col.names = colname) KYses <- KYses[2:121,c(1,2,4,6)] LAses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Census/LA/est00_LA.dat", widths = wide, col.names = colname) LAses <- LAses[2:65,c(1,2,4,6)] NJses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Census/NJ/est00_NJ.dat", widths = wide, col.names = colname) NJses <- NJses[2:22,c(1,2,4,6)] NMses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Census/NM/est00_NM.dat", widths = wide, col.names = colname) NMses <- NMses[2:34,c(1,2,4,6)] UTses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Census/UT/est00_UT.dat", widths = wide, col.names = colname) UTses <- UTses[2:30,c(1,2,4,6)] OHses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/OH/est00_OH.dat", widths = wide, col.names = colname) OHses <- OHses[2:89,c(1,2,4,6)] MOses <- read.table("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/MO/est00_MO.dat", sep="\t") MOses <- MOses[,c(1,2,21,24)] colnames(MOses) <- c("SFIPS", "CFIPS", "MedIncome", "SESName") WVses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/WV/est00_WV.dat", widths = wide, col.names = colname) WVses <- WVses[2:56, c(1,2,4,6)] PAses <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/Other Inc/PA/est00_PA.dat", widths = wide, col.names = colname) PAses <- PAses[2:68, c(1,2,4,6)] SES <- rbind(CAses, CTses, IAses, KYses, LAses, NJses, NMses, UTses, MOses, OHses, WVses, PAses) ## Add leading zeroes to the State/County FIPS Code SES$CFIPS <- ifelse(as.numeric(SES$CFIPS) < 10, paste(0, SES$CFIPS, sep=""), SES$CFIPS) SES$CFIPS <- ifelse(as.numeric(SES$CFIPS) < 100, paste(0, SES$CFIPS, sep=""), SES$CFIPS) full <- cbind(full, SES) ##################### ## Read in BRFSS data ## ##################### wide <- c(2, 95, 1, 1, 1, 2, 29, 3, 1158) #max 1293 colname <- c("State", "Junk1", "Smoke100", "SmokeDay", "Quit", "Age", "Junk3", "County", "Junk4") ## Read in the data brfss.pre <- read.fwf("C:/Users/jputila/Desktop/Heavy Metal Data/BRFSS/Year08/CDBRFS08.ASC", widths = wide, col.names = colname) ## Remove extraneous variables and observations brfss <- brfss.pre[,c(1,3,4,5,6,8)] #rm(brfss.pre) brfss$State <- ifelse(brfss$State < 10, paste("0", brfss$State, sep=""), brfss$State) brfss <- brfss[(brfss$State=="06" | brfss$State=="09" | brfss$State=="19" | brfss$State=="21" | brfss$State=="22" | brfss$State=="29" | brfss$State=="34" | brfss$State=="35" | brfss$State=="39" | brfss$State=="49" | brfss$State=="54" | brfss$State=="42") & !is.na(brfss$State),] ## Only adults brfss <- brfss[brfss$Age>=18,] ## Add leading zeroes to the County FIPS Code brfss$County <- ifelse(as.numeric(brfss$County) < 10 , paste(0, brfss$County, sep=""), brfss$County) brfss$County <- ifelse(as.numeric(brfss$County) < 100, paste(0, brfss$County, sep=""), brfss$County) ## Get rid entries with county suppressed due to small sample brfss <- brfss[!is.na(brfss$County),] ## Code smoking status ## Smoked 100 cigarettes in lifetime, currently smokes daily, or currently smokes daily/some days smk100 <- ifelse(brfss$Smoke100==1, 1, ifelse(brfss$Smoke100==2,0, NA)) smkDay <- ifelse(brfss$SmokeDay==1, 1, ifelse(smk100==0 | brfss$SmokeDay==2 | brfss$SmokeDay==3, 0, NA)) smkAny <- ifelse(brfss$SmokeDay==1 | brfss$SmokeDay==2, 1, ifelse(brfss$SmokeDay==3 | smk100==0, 0, NA)) scfips <- paste(brfss$State, brfss$County, sep="") rateDay <- matrix(rep(NA, NUMCOUNT), NUMCOUNT) rateAny <- matrix(rep(NA, NUMCOUNT), NUMCOUNT) rate100 <- matrix(rep(NA, NUMCOUNT), NUMCOUNT) for(i in 1:NUMCOUNT){ rate100[i] <- (dim(brfss[scfips==as.character(factor(full[i,2])) & smk100==1 & !is.na(smk100),])[1] / dim(brfss[scfips==as.character(factor(full[i,2])),])[1]) } full <- cbind(full, rate100) ############################### ## Calculate Metal Concentrations ## ############################### ## Import Metals data metal <- read.csv("C:/Users/jputila/Desktop/Heavy Metal Data/geochem-2009-10-22-14-17-04.txt") metal$FIPS <- substr(metal$FIPS, 2, 6) ## Get rid of invalid pH values metal$PH <- ifelse(metal$PH == 88, NA, metal$PH) ## County FIPS Identifier to confirm accuracy of matching scheme MetalName <- matrix(NA, nrow = NUMCOUNT, ncol = 1) for(i in 1:NUMCOUNT){ MetalName[i] <- as.character(factor(full[i,]$County))} ## Arsenic ## Calculate As values ## Do not include "-20" codes in estimates Ascounty <- matrix(NA, nrow = NUMCOUNT, ncol = 1) for(i in 1:NUMCOUNT){ Ascounty[i] <- mean(c(na.omit(metal[metal[,42] == as.character(factor(full[i,]$FIPS)) & metal$AS_ICP40 >= 0,]$AS_ICP40) , na.omit(metal[metal[,42] == as.character(factor(full[i,]$FIPS)) & metal$AS_AA >= 0 ,]$AS_AA), na.omit(metal[metal[,42] == as.character(factor(full[i,]$FIPS)) & metal$AS_INAA >= 0 ,]$AS_INAA))) } Ascounty <- ifelse(Ascounty==0, NA, Ascounty) ## Final data set full <- cbind(full, MetalName, Ascounty) ## Remove metal Dsets #rm(MetalName, Ascounty) ##################### ## Statistical Analyses ## ##################### ## GGPlot package library(ggplot2) ## Confidence intervals cconf <- function(model){ numcov <- dim(coef(summary(model)))[1] uci <- NA lci <- NA for(i in 1:numcov){ lci[i] <- exp(coef(summary(model))[i,1] - 1.96*coef(summary(model))[i,2]) uci[i] <- exp(coef(summary(model))[i,1] + 1.96*coef(summary(model))[i,2]) } outT <- data.frame(c(dimnames(coef(summary(model)))[1]), lci, exp(coef(summary(model))[,1]), uci) names(outT) <- c("Variable", "LowerCI", "Estimate", "UpperCI") outT } ############################ ## Read in supplementary data ## ############################ ## Read in data if saved previously ## full <- read.csv("C:/Users/jputila/Desktop/PLoS_Supplementary/full_8_18.csv") ## Remove Column Header ## full <- full[,-1] AdjRatebak <- full$AdjRate full$AdjRate <- round(as.numeric(full$AdjRate),0) ## Assign smoking rate (>=100 cigs over lifetime) full$smkrate <- full$rate100 ####################### ## Measure Associations ## ####################### ## Generate centered and transformed variables full$lnAs <- log(full$Ascounty) - mean(na.omit(log(full$Ascounty))) full$lnInc <- log(full$MedIncome) - mean(na.omit(log(full$MedIncome))) full$Population <- as.numeric(as.character(full$Population)) full$lnsmk <- full$smkrate ## Bivariate, Untransformed ## Arsenic Levels and Lung Cancer Incidence, weighted glm1 <- glm(full$AdjRate ~ full$Ascounty, family=poisson, weights=as.numeric(full$Population)) summary(glm1) cconf(glm1) plot(y=log(full$AdjRate), x=full$Ascounty, ylab="ln(Cancer Incidence per 10,000,000)", xlab="ln(As [ppm])", main="Association between Chromium and Lung Cancer Incidence") abline(a=glm1$coef[1], b=glm1$coef[2], col=1) glm1T <- data.frame(glm1$linear.predictors, glm1$y, glm1$model[3]) glm1gg <- ggplot(glm1T, aes(x=glm1.linear.predictors, y=log(glm1.y))) glm1gg + geom_point(aes(size=X.weights.)) + geom_abline(coef(glm(log(full$AdjRate) ~ full$Ascounty, family=poisson, weights=as.numeric(full$Population)))) ## Adjusted, Untransformed ## Arsenic, Smoking, SES SESassmk <- glm(full$AdjRate ~ full$smkrate + full$Ascounty + full$MedIncome, family=poisson, weights=as.numeric(full$Population)) summary(SESassmk) cconf(SESassmk) ## Estimate the 25, 50, amd 75% quartile points for each variable for the quartiles interaction models AsCut <- NA AsCut[1] <- as.numeric(summary(full$lnAs)[2]) AsCut[2] <- as.numeric(summary(full$lnAs)[3]) AsCut[3] <- as.numeric(summary(full$lnAs)[5]) SmkCut <- NA SmkCut[1] <- as.numeric(summary(full$lnsmk)[2]) SmkCut[2] <- as.numeric(summary(full$lnsmk)[3]) SmkCut[3] <- as.numeric(summary(full$lnsmk)[5]) SESCut <- NA SESCut[1] <- as.numeric(summary(full$lnInc)[2]) SESCut[2] <- as.numeric(summary(full$lnInc)[3]) SESCut[3] <- as.numeric(summary(full$lnInc)[5]) ## Continuous Interaction Models ## Arsenic and Smoking AsSmk <- full$lnAs * full$lnsmk intAsSmk <- aov(full$AdjRate ~ full$lnsmk + full$lnAs + full$lnInc + AsSmk, weights=as.numeric(full$Population)) summary(intAsSmk) ## Arsenic and SES AsSES <- full$lnAs * full$lnInc intAsSES <- aov(full$AdjRate ~ full$lnsmk + full$lnAs + full$lnInc + AsSES, weights=as.numeric(full$Population)) summary(intAsSES) ## Calculate Strat Groups ## ## Smoking Quartiles smkgrp <- ifelse(is.na(full$lnsmk), NA, ifelse(full$lnsmk < SmkCut[1], 1, ifelse(full$lnsmk >= SmkCut[1] & full$lnsmk < SmkCut[2], 2, ifelse(full$lnsmk >= SmkCut[2] & full$lnsmk < SmkCut[3], 3, 4)))) ## SES Low-Income Cutoffs SESgrp <- ifelse(is.na(full$MedIncome), NA, ifelse(full$MedIncome < 24000 & !is.na(full$MedIncome), 1, ifelse(full$MedIncome>= 24000 & full$MedIncome< 28700, 2, ifelse(full$MedIncome >= 28700 & full$MedIncome < 38300, 3, 4)))) ## SES Quartiles #SESgrp <- ifelse(full$lnInc< -0.158, 1, ifelse(full$lnInc>=-0.158 & full$lnInc< -0.00391, 2, ifelse(full$lnInc>=-0.00391 & full$lnInc <0.1478, 3, 4))) ## Arsenic Quartiles AsQ <- ifelse(is.na(full$lnAs), NA, ifelse(full$lnAs < AsCut[1], 1, ifelse(full$lnAs >= AsCut[1] & full$lnAs < AsCut[2], 2, ifelse(full$lnAs >= AsCut[2] & full$lnAs < AsCut[3], 3, 4)))) ## Quartile-Based Interaction Models ## Convert quartiles to factors AsQf <- as.factor(AsQ) smkgrpf <- as.factor(smkgrp) smkgrpfbak <- smkgrpf SESgrpf <- as.factor(SESgrp) ############# ## ARSENIC ## ############# ## Arsenic and Smoking smkgrpf <- smkgrpfbak smkgrpf <- ifelse(is.na(smkgrpf), NA, ifelse(smkgrpf==1 | smkgrpf==2, 1, 2)) intAsSmk <- aov(full$AdjRate ~ SESgrpf + AsQf*smkgrpf, weights=as.numeric(full$Population)) summary(intAsSmk) ## Without SES intAsSmk2 <- aov(full$AdjRate ~ AsQf*smkgrpf, weights=as.numeric(full$Population)) summary(intAsSmk2) ## Arsenic and SES intAsSES <- aov(full$AdjRate ~ smkgrpf + AsQf*SESgrpf, weights=as.numeric(full$Population)) summary(intAsSES) ## GLMS for smoking levels WITHOUT SES ## Bottom 50% vs Top 50% smkgrpf <- smkgrpfbak smkgrpf <- ifelse(is.na(smkgrpf), NA, ifelse(smkgrpf==1 | smkgrpf==2, 1, 2)) r1 <- glm(full[smkgrpf==1,]$AdjRate ~ full[smkgrpf==1,]$lnAs, family=poisson, weights=as.numeric(full[smkgrpf==1,]$Population)) summary(r1) r2 <- glm(full[smkgrpf==2,]$AdjRate ~ full[smkgrpf==2,]$lnAs, family=poisson, weights=as.numeric(full[smkgrpf==2,]$Population)) summary(r2) ## Plot the Interaction between Arsenic and Smoking WITHOUT SES data1 <- cbind( c(t(r1$model[2]),t(r2$model[2])), c(log(r1$fitted.values),log(r2$fitted.values)), c(r1$weights, r2$weights), c(rep("1",dim(r1$model[2])[1]), rep("2",dim(r2$model[2])[1]))) data1 <- as.data.frame(data1, stringsAsFactors=FALSE) names(data1) <- c("logAs","logRate", "weight", "smkgrp") data1$logAs <- as.numeric(as.character(data1$logAs)) data1$logRate <- as.numeric(as.character(data1$logRate)) data1$weight <- as.numeric(as.character(data1$weight)) data1$smkgrp <- as.numeric(as.character(data1$smkgrp)) data1$adjinc <- c(as.numeric(coef(r1)[2])*r1$model[,2]*0, as.numeric(coef(r2)[2])*r2$model[,2]*0) data1$adjrate <- data1$adjinc+data1$logRate assmkp <- ggplot(data1, aes(x=logAs, y=adjrate, shape=factor(smkgrp), color=factor(smkgrp))) assmkp + stat_smooth(method = "glm", level=0.95, alpha=1, fill="grey80", color="black") + #scale_color_manual(values=c("grey50","grey70")) + geom_point(aes(size=weight)) + geom_point() + opts(legend.position = "right") + theme_bw() ## GLMS for smoking levels WITH SES smkgrpf <- smkgrpfbak ## Bottom 50% vs Top 50% smkgrpf <- ifelse(is.na(smkgrpf), NA, ifelse(smkgrpf==1 | smkgrpf==2, 1, 2)) r1 <- glm(full[smkgrpf==1,]$AdjRate ~ full[smkgrpf==1,]$lnAs + full[smkgrpf==1,]$MedIncome, family=poisson, weights=as.numeric(full[smkgrpf==1,]$Population)) summary(r1) r2 <- glm(full[smkgrpf==2,]$AdjRate ~ full[smkgrpf==2,]$lnAs + full[smkgrpf==2,]$MedIncome, family=poisson, weights=as.numeric(full[smkgrpf==2,]$Population)) summary(r2) ## Plot the Interaction between Arsenic and Smoking with SES data1 <- cbind( c(t(r1$model[2]),t(r2$model[2])), c(log(r1$fitted.values),log(r2$fitted.values)), c(r1$weights, r2$weights), c(rep("1",dim(r1$model[2])[1]), rep("2",dim(r2$model[2])[1]))) data1 <- as.data.frame(data1, stringsAsFactors=FALSE) names(data1) <- c("logAs","logRate", "weight", "smkgrp") data1$logAs <- as.numeric(as.character(data1$logAs)) data1$logRate <- as.numeric(as.character(data1$logRate)) data1$weight <- as.numeric(as.character(data1$weight)) data1$smkgrp <- as.numeric(as.character(data1$smkgrp)) data1$adjinc <- c(as.numeric(coef(r1)[2])*r1$model[,2], as.numeric(coef(r2)[2])*r2$model[,2]) data1$adjrate <- data1$adjinc+data1$logRate assmkp <- ggplot(data1, aes(x=logAs, y=adjrate, shape=factor(smkgrp), color=factor(smkgrp))) assmkp + stat_smooth(method = "glm", level=0.95, alpha=1, fill="grey80", color="black") + #scale_color_manual(values=c("grey50","grey70")) + geom_point(aes(size=weight)) + geom_point() + opts(legend.position = "right") + theme_bw() ################################# ## West Virginia / Kentucky Analyses ## ################################# ## Create WV/KY set via FIPS (WV=54, KY=21) wvky <- full[full$SFIPS==54 | full$SFIPS==21,] ## Bivariate, National glm1 <- glm(full$AdjRate ~ full$lnAs, family=poisson, weights=as.numeric(full$Population)) summary(glm1) plot(full$AdjRate, full$lnAs, xlab="Cancer Incidence per 100,000", ylab="As [ppm]") abline(a=glm1$coef[1], b=glm1$coef[2], col=1) ## Adjusted for Smoking, National glm2 <- glm(full$AdjRate ~ full$smkrate + full$Ascounty, family=poisson, weights=as.numeric(full$Population)) summary(glm2) ## Adjusted for Smoking, Income, National glm3 <- glm(full$AdjRate ~ full$smkrate + full$Ascounty + full$MedIncome, family=poisson, weights=as.numeric(full$Population)) summary(glm3) ## GGplot full$wvkyid <- ifelse(full$SFIPS==54 | full$SFIPS==21, 1, 0) cols <- c("0" = "red","1" = "blue") p1 <- ggplot(full, aes(y=AdjRate, x=log(Ascounty), weight=Population, colour=full$wvkyid)) p1 + stat_smooth(method=glm, level=0.95, alpha=1, fill="grey80", color="black") + geom_point() + scale_colour_manual(values = cols, limits = c("0", "1")) + theme_bw() ## Compare residuals for predicting on WVKY using the National fully controlled model pred1 <- predict(glm3, full) hist(pred1) diffresid <- pred1 - log(full$AdjRate) resids <- data.frame(as.character(full$County), pred1, full$AdjRate, diffresid) qplot(diffresid[full$wvkyid==1], data=resids, geom="histogram") + theme_bw() ## Determine the concentration of each heavy metal in KY|WV and !KY&!WV ## Average County averages for both states ## Arsenic KYWVas <- mean(na.omit(full[full$SFIPS==21 | full$SFIPS==54,]$Ascounty)) KYWVas notKYWVas <- mean(na.omit(full[full$SFIPS!=21 & full$SFIPS!=54,]$Ascounty)) notKYWVas t.test(na.omit(full[full$SFIPS==21 | full$SFIPS==54,]$Ascounty), na.omit(full[full$SFIPS!=21 & full$SFIPS!=54,]$Ascounty)) ## Smoking Prevalence KYWVsmk <- mean(na.omit(full[full$SFIPS==21 | full$SFIPS==54,]$smkrate)) KYWVsmk notKYWVsmk <- mean(na.omit(full[full$SFIPS!=21 & full$SFIPS!=54,]$smkrate)) notKYWVsmk t.test(na.omit(full[full$SFIPS==21 | full$SFIPS==54,]$smkrate), na.omit(full[full$SFIPS!=21 & full$SFIPS!=54,]$smkrate)) ## Median Income KYWVmed <- mean(na.omit(full[full$SFIPS==21 | full$SFIPS==54,]$MedIncome)) KYWVmed notKYWVmed <- mean(na.omit(full[full$SFIPS!=21 & full$SFIPS!=54,]$MedIncome)) notKYWVmed t.test(na.omit(full[full$SFIPS==21 | full$SFIPS==54,]$MedIncome), na.omit(full[full$SFIPS!=21 & full$SFIPS!=54,]$MedIncome)) ## Lung cancer incidence KYWVrate <- mean(na.omit(full[full$SFIPS==21 | full$SFIPS==54,]$AdjRate)) KYWVrate notKYWVrate <- mean(na.omit(full[full$SFIPS!=21 & full$SFIPS!=54,]$AdjRate)) notKYWVrate t.test(na.omit(full[full$SFIPS==21 | full$SFIPS==54,]$AdjRate), na.omit(full[full$SFIPS!=21 & full$SFIPS!=54,]$AdjRate)) ## Draw a bar chart of the values used in the T-tests bardata <- c(notKYWVas, KYWVas, notKYWVsmk*100, KYWVsmk*100, notKYWVmed/1000, KYWVmed/1000, notKYWVrate, KYWVrate) barcats <- c("Arsenic", "Arsenic", "Smoking", "Smoking", "Income", "Income", "Rate", "Rate") barid <- c("Other", "WV/KY", "Other", "WV/KY", "Other", "WV/KY", "Other", "WV/KY") bardata1 <- data.frame(barid, barcats, bardata) names(bardata1) <- c("ID", "Category", "Values") bar1 <- ggplot(bardata1, aes(Category, fill=ID, y=Values)) bar1 + geom_bar(position="dodge", stat="identity") + theme_bw() ## END of code