## S5 Appendix. THE MAKING OF A (DOG) MOVIE STAR (CODE).
#### MULTIPLE REGRESSION ####
# Fits hierarchical linear models predicting popularity-trend scores
# (Y1/Y2/Y5/Y10 = 1-, 2-, 5- and 10-year changes) from portrayal factors,
# once with movie rereleases included and once without.

## LOAD PACKAGES
library(tidyverse)   # for data manipulation
library(ggplot2)     # for creating plots
library(lm.beta)     # for regression analysis (standardised coefficients)
library(corpcor)     # for rotation
library(GPArotation) # for rotation
library(psych)       # for the factor analysis
library(nFactors)    # to determine number of factors to retain
library(dplyr)
library(tidyr)
library(pander)      # to format outputs that can be used in docs
library(forcats)
library(lmtest)      # to test for independence
library(car)         # to run the VIF test, QQPlot and Levene's test
library(boot)        # to run bootstrapping
library(apaTables)   # to create APA format publication tables
library(forecast)
library(gridExtra)
library(pastecs)     # FIX: provides stat.desc(), used below but previously not loaded

## SET WORKING DIRECTORY
setwd(" ")  # NOTE(review): placeholder - set to the data directory before sourcing

## LOAD DATA
movie.list                 <- read_csv("movie_list_v2.csv")
character.list             <- read_csv("character_list_v2.csv")
final.scores.no.rereleases <- read_csv("final_score_v6_without_rereleases.csv")
final.scores.rereleases    <- read_csv("final_score_v6_with_rereleases.csv")

######### 1 YEAR TRENDS #########

#### WITH RERELEASES INCLUDED

## MODELS (hierarchical entry: one predictor added per step)
Y1.rerelease.1 <- lm(Y1 ~ western_ideals, final.scores.rereleases)
Y1.rerelease.2 <- lm(Y1 ~ western_ideals + dog_hero, final.scores.rereleases)
Y1.rerelease.3 <- lm(Y1 ~ western_ideals + dog_hero + anthro,
                     final.scores.rereleases)
Y1.rerelease.4 <- lm(Y1 ~ western_ideals + dog_hero + anthro + nature_society,
                     final.scores.rereleases)

## COMPARE MODELS
anova(Y1.rerelease.1, Y1.rerelease.2, Y1.rerelease.3, Y1.rerelease.4)
# Model 2 is the best to use

## RESULTS
# FIX: summary() previously reported model 3 even though the comparison above,
# lm.beta() and confint() all use model 2 - report model 2 consistently.
summary(Y1.rerelease.2)
lm.beta(Y1.rerelease.2)  # for standardised coefficients
confint(Y1.rerelease.2)  # confidence intervals
stat.desc(final.scores.rereleases$Y1)

# ---------------------------------------------------------------------------
#### WITH RERELEASES NOT INCLUDED

## MODELS
Y1.no.rerelease.1 <- lm(Y1 ~ dog_hero, final.scores.no.rereleases)
Y1.no.rerelease.2 <- lm(Y1 ~ dog_hero + anthro, final.scores.no.rereleases)
Y1.no.rerelease.3 <- lm(Y1 ~ dog_hero + anthro + western_ideals,
                        final.scores.no.rereleases)
Y1.no.rerelease.4 <- lm(Y1 ~ dog_hero + anthro + western_ideals + nature_society,
                        final.scores.no.rereleases)

## COMPARE MODELS
anova(Y1.no.rerelease.1, Y1.no.rerelease.2, Y1.no.rerelease.3, Y1.no.rerelease.4)
# Model 2 is the best to use

## RESULTS
summary(Y1.no.rerelease.2)
lm.beta(Y1.no.rerelease.2)  # for standardised coefficients
confint(Y1.no.rerelease.2)  # confidence intervals

######### 2 YEAR TRENDS #########

#### WITH RERELEASES INCLUDED

## MODELS
Y2.rerelease.1 <- lm(Y2 ~ dog_hero, final.scores.rereleases)
Y2.rerelease.2 <- lm(Y2 ~ dog_hero + anthro, final.scores.rereleases)
Y2.rerelease.3 <- lm(Y2 ~ dog_hero + anthro + western_ideals,
                     final.scores.rereleases)
Y2.rerelease.4 <- lm(Y2 ~ dog_hero + anthro + western_ideals + nature_society,
                     final.scores.rereleases)

## COMPARE MODELS
anova(Y2.rerelease.1, Y2.rerelease.2, Y2.rerelease.3, Y2.rerelease.4)
# Model 2 is the best to use

## RESULTS
summary(Y2.rerelease.2)
lm.beta(Y2.rerelease.2)  # for standardised coefficients
confint(Y2.rerelease.2)  # confidence intervals

# ---------------------------------------------------------------------------
#### WITH RERELEASES NOT INCLUDED

## MODELS
Y2.no.rerelease.1 <- lm(Y2 ~ dog_hero, final.scores.no.rereleases)
Y2.no.rerelease.2 <- lm(Y2 ~ dog_hero + anthro, final.scores.no.rereleases)
Y2.no.rerelease.3 <- lm(Y2 ~ dog_hero + anthro + western_ideals,
                        final.scores.no.rereleases)
Y2.no.rerelease.4 <- lm(Y2 ~ dog_hero + anthro + western_ideals + nature_society,
                        final.scores.no.rereleases)

## COMPARE MODELS
anova(Y2.no.rerelease.1, Y2.no.rerelease.2, Y2.no.rerelease.3, Y2.no.rerelease.4)
# Model 2 is the best to use

## RESULTS
summary(Y2.no.rerelease.2)
lm.beta(Y2.no.rerelease.2)  # for standardised coefficients
confint(Y2.no.rerelease.2)  # confidence intervals

######### 5 YEAR TRENDS #########

#### WITH RERELEASES INCLUDED

## CALCULATE SAMPLE SIZE (Y5 has missing values for recent movies)
sum(!is.na(final.scores.rereleases$Y5))

## MODELS (lm() drops the NA rows itself here)
Y5.rerelease.1 <- lm(Y5 ~ dog_hero, final.scores.rereleases)
Y5.rerelease.2 <- lm(Y5 ~ dog_hero + anthro, final.scores.rereleases)
Y5.rerelease.3 <- lm(Y5 ~ dog_hero + anthro + western_ideals,
                     final.scores.rereleases)
Y5.rerelease.4 <- lm(Y5 ~ dog_hero + anthro + western_ideals + nature_society,
                     final.scores.rereleases)

## COMPARE MODELS
anova(Y5.rerelease.1, Y5.rerelease.2, Y5.rerelease.3, Y5.rerelease.4)
# Model 2 is the best to use

## RESULTS
summary(Y5.rerelease.2)
lm.beta(Y5.rerelease.2)  # for standardised coefficients
confint(Y5.rerelease.2)  # confidence intervals

# ---------------------------------------------------------------------------
#### WITH RERELEASES NOT INCLUDED

## CALCULATE SAMPLE SIZE
sum(!is.na(final.scores.no.rereleases$Y5))  # 66

## CREATE NEW DATAFRAME WITH NA SCORES REMOVED
# FIX: dropped the previous select(1:ncol(.)) step, which was a no-op.
final.scores.no.rereleases.Y5 <- final.scores.no.rereleases %>%
  filter(!is.na(Y5))
View(final.scores.no.rereleases.Y5)

## MODELS
Y5.no.rerelease.1 <- lm(Y5 ~ dog_hero, final.scores.no.rereleases.Y5)
Y5.no.rerelease.2 <- lm(Y5 ~ dog_hero + anthro, final.scores.no.rereleases.Y5)
Y5.no.rerelease.3 <- lm(Y5 ~ dog_hero + anthro + western_ideals,
                        final.scores.no.rereleases.Y5)
Y5.no.rerelease.4 <- lm(Y5 ~ dog_hero + anthro + western_ideals + nature_society,
                        final.scores.no.rereleases.Y5)

## COMPARE MODELS
anova(Y5.no.rerelease.1, Y5.no.rerelease.2, Y5.no.rerelease.3, Y5.no.rerelease.4)
# Model 2 is the best to use

## RESULTS
summary(Y5.no.rerelease.2)
lm.beta(Y5.no.rerelease.2)  # for standardised coefficients
confint(Y5.no.rerelease.2)  # confidence intervals

######### 10 YEAR TRENDS #########

#### WITH RERELEASES INCLUDED

## CALCULATE SAMPLE SIZE
sum(!is.na(final.scores.rereleases$Y10))  # 74

## CREATE NEW DATAFRAME
# FIX: dropped the previous select(1:ncol(.)) step, which was a no-op.
final.scores.rereleases.Y10 <- final.scores.rereleases %>%
  filter(!is.na(Y10))

## MODELS
Y10.rerelease.1 <- lm(Y10 ~ dog_hero, final.scores.rereleases.Y10)
Y10.rerelease.2 <- lm(Y10 ~ dog_hero + anthro, final.scores.rereleases.Y10)
Y10.rerelease.3 <- lm(Y10 ~ dog_hero + anthro + western_ideals,
                      final.scores.rereleases.Y10)
Y10.rerelease.4 <- lm(Y10 ~ dog_hero + anthro + western_ideals + nature_society,
                      final.scores.rereleases.Y10)

## COMPARE MODELS
anova(Y10.rerelease.1, Y10.rerelease.2, Y10.rerelease.3, Y10.rerelease.4)
# Results are not significant so using 4th model to report results

## RESULTS
summary(Y10.rerelease.4)
lm.beta(Y10.rerelease.4)  # for standardised coefficients
confint(Y10.rerelease.4)  # confidence intervals

# ---------------------------------------------------------------------------
#### WITH RERELEASES NOT INCLUDED

## CALCULATE SAMPLE SIZE
sum(!is.na(final.scores.no.rereleases$Y10))  # 50

## CREATE NEW DATASET WITH NA SCORES REMOVED
# FIX: dropped the previous select(1:ncol(.)) step, which was a no-op.
final.scores.no.rereleases.Y10 <- final.scores.no.rereleases %>%
  filter(!is.na(Y10))
View(final.scores.no.rereleases.Y10)

## MODELS
Y10.no.rerelease.1 <- lm(Y10 ~ dog_hero, final.scores.no.rereleases.Y10)
Y10.no.rerelease.2 <- lm(Y10 ~ dog_hero + anthro, final.scores.no.rereleases.Y10)
Y10.no.rerelease.3 <- lm(Y10 ~ dog_hero + anthro + western_ideals,
                         final.scores.no.rereleases.Y10)
Y10.no.rerelease.4 <- lm(Y10 ~ dog_hero + anthro + western_ideals + nature_society,
                         final.scores.no.rereleases.Y10)

## COMPARE MODELS
anova(Y10.no.rerelease.1, Y10.no.rerelease.2, Y10.no.rerelease.3, Y10.no.rerelease.4)
# No results are significant so diagnostics run on model 4

## RESULTS
summary(Y10.no.rerelease.4)
lm.beta(Y10.no.rerelease.4)  # for standardised coefficients
confint(Y10.no.rerelease.4)  # confidence intervals

## END
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
###### TESTING ASSUMPTIONS FOR MULTIPLE REGRESSION MODELS ######
# Diagnostics for the optimal models chosen in the multiple-regression script:
# outliers and influential cases, independence of errors, multicollinearity
# and residual behaviour.

## LOAD PACKAGES
library(tidyverse)   # for data manipulation
library(ggplot2)     # for creating plots
library(lm.beta)     # for regression analysis
library(corpcor)     # for rotation
library(GPArotation) # for rotation
library(psych)       # for the factor analysis
library(nFactors)    # to determine number of factors to retain
library(dplyr)
library(tidyr)
library(pander)      # to format outputs that can be used in docs
library(forcats)
library(lmtest)      # to test for independence
library(car)         # to run the VIF test, QQPlot and Levene's test
library(boot)        # to run bootstrapping
library(apaTables)   # to create APA format publication tables
library(forecast)    # to run autocorrelation diagnostic tests
library(gridExtra)   # to put all grids together

## SET WORKING DIRECTORY
setwd(" ")  # NOTE(review): placeholder - set to the data directory before sourcing

## LOAD DATA
# One copy of the scores per outcome so each can carry its own diagnostic columns.
movie.list                  <- read_csv("movie_list_v2.csv")
character.list              <- read_csv("character_list_v2.csv")
final.scores.rereleases     <- read_csv("final_score_v6_with_rereleases.csv")
final.scores.rereleases.Y1  <- read_csv("final_score_v6_with_rereleases.csv")
final.scores.rereleases.Y2  <- read_csv("final_score_v6_with_rereleases.csv")
final.scores.rereleases.Y5  <- read_csv("final_score_v6_with_rereleases.csv")
final.scores.rereleases.Y10 <- read_csv("final_score_v6_with_rereleases.csv")

## REMOVE NA FIELDS FROM FIVE AND TEN YEAR DATA
# FIX: dropped the previous select(1:ncol(.)) steps, which were no-ops.
final.scores.rereleases.Y5  <- final.scores.rereleases.Y5 %>% filter(!is.na(Y5))
final.scores.rereleases.Y10 <- final.scores.rereleases.Y10 %>% filter(!is.na(Y10))

## MODELS
# Optimal model found in multiple_regression script
Y1.rerelease.2  <- lm(Y1 ~ dog_hero + anthro, final.scores.rereleases.Y1)
Y2.rerelease.2  <- lm(Y2 ~ dog_hero + anthro, final.scores.rereleases.Y2)
Y5.rerelease.2  <- lm(Y5 ~ dog_hero + anthro, final.scores.rereleases.Y5)
Y10.rerelease.4 <- lm(Y10 ~ dog_hero + anthro + western_ideals + nature_society,
                      final.scores.rereleases.Y10)

# ---------------------------------------------------------------------------
######### OUTLIERS AND INFLUENCERS #########

# Append case-wise outlier/influence statistics from `model` to `df`.
# Returns `df` with the extra columns; behaviour matches the previous
# copy-pasted per-outcome assignments.
add_influence_stats <- function(df, model) {
  df$residuals              <- resid(model)
  df$standardized.residuals <- rstandard(model)
  df$studentized.residuals  <- rstudent(model)
  df$cooks.distance         <- cooks.distance(model)
  df$dfbeta                 <- dfbeta(model)  # matrix column: one DFBETA per coefficient
  df$dffit                  <- dffits(model)
  df$leverage               <- hatvalues(model)
  df$covariance.ratios      <- covratio(model)
  df$fitted                 <- model$fitted.values
  df
}

# 1 YEAR TRENDS
final.scores.rereleases.Y1 <- add_influence_stats(final.scores.rereleases.Y1,
                                                  Y1.rerelease.2)
# Deliberate constant column - exercised later by the zero-variance check.
final.scores.rereleases.Y1$NoVariance <- 1
View(final.scores.rereleases.Y1)

# 2 YEAR TRENDS
final.scores.rereleases.Y2 <- add_influence_stats(final.scores.rereleases.Y2,
                                                  Y2.rerelease.2)
View(final.scores.rereleases.Y2)

# 5 YEAR TRENDS
final.scores.rereleases.Y5 <- add_influence_stats(final.scores.rereleases.Y5,
                                                  Y5.rerelease.2)
View(final.scores.rereleases.Y5)

# 10 YEAR TRENDS
final.scores.rereleases.Y10 <- add_influence_stats(final.scores.rereleases.Y10,
                                                   Y10.rerelease.4)
View(final.scores.rereleases.Y10)

## FIND LARGE RESIDUALS (|standardized residual| > 2)
final.scores.rereleases.Y1$large.residual <-
  abs(final.scores.rereleases.Y1$standardized.residuals) > 2
final.scores.rereleases.Y2$large.residual <-
  abs(final.scores.rereleases.Y2$standardized.residuals) > 2
final.scores.rereleases.Y5$large.residual <-
  abs(final.scores.rereleases.Y5$standardized.residuals) > 2
final.scores.rereleases.Y10$large.residual <-
  abs(final.scores.rereleases.Y10$standardized.residuals) > 2

## FIND ACCEPTED NUMBER OF RESIDUALS
# About 5% of standardized residuals may legitimately lie outside +/-2 and
# about 1% outside +/-2.5.
# 1 AND 2 YEAR TRENDS (n = 95)
95 * 0.05  # 4.75 cases expected outside +/-2
95 * 0.01  # 0.95 cases expected outside +/-2.5
# 5 YEAR TRENDS
sum(!is.na(final.scores.rereleases.Y5$Y5)) * 0.05   # 4.5
sum(!is.na(final.scores.rereleases.Y5$Y5)) * 0.01   # 0.9
# 10 YEAR TRENDS
sum(!is.na(final.scores.rereleases.Y10$Y10)) * 0.05  # 3.7
sum(!is.na(final.scores.rereleases.Y10$Y10)) * 0.01  # 0.74

## CALCULATE NUMBER OF RESIDUALS IN DATA
sum(final.scores.rereleases.Y1$large.residual)   # 5 residuals
sum(final.scores.rereleases.Y2$large.residual)   # 6 residuals
sum(final.scores.rereleases.Y5$large.residual)   # 5 residuals
sum(final.scores.rereleases.Y10$large.residual)  # 4 residuals

## SHOW LARGE RESIDUAL TABLES
id.cols <- c("movie", "year", "character", "breed")  # case identifiers
final.scores.rereleases.Y1[final.scores.rereleases.Y1$large.residual,
                           c(id.cols, "standardized.residuals")]
final.scores.rereleases.Y2[final.scores.rereleases.Y2$large.residual,
                           c(id.cols, "standardized.residuals")]
final.scores.rereleases.Y5[final.scores.rereleases.Y5$large.residual,
                           c(id.cols, "standardized.residuals")]
final.scores.rereleases.Y10[final.scores.rereleases.Y10$large.residual,
                            c(id.cols, "standardized.residuals")]
# Investigate any cases outside +/-3 further.

# CHECK COOK'S DISTANCE, LEVERAGE AND COVARIANCE RATIOS
influence.cols <- c(id.cols, "cooks.distance", "leverage", "covariance.ratios")
final.scores.rereleases.Y1[final.scores.rereleases.Y1$large.residual, influence.cols]
final.scores.rereleases.Y2[final.scores.rereleases.Y2$large.residual, influence.cols]
final.scores.rereleases.Y5[final.scores.rereleases.Y5$large.residual, influence.cols]
final.scores.rereleases.Y10[final.scores.rereleases.Y10$large.residual, influence.cols]
# Cook's distance > 1 would flag a case as unduly influencing the model.

## CALCULATE AVERAGE LEVERAGE: (k + 1) / n
# 1 AND 2 YEAR TRENDS (k = 2 predictors, n = 95)
(2 + 1) / 95        # average = 0.03157895
((2 + 1) / 95) * 2  # twice as large = 0.0631579  - flag cases above this
((2 + 1) / 95) * 3  # three times as large = 0.09473685 - flag cases above this
# 5 YEAR TRENDS (k = 2)
(2 + 1) / sum(!is.na(final.scores.rereleases.Y5$Y5))        # 0.03333333
((2 + 1) / sum(!is.na(final.scores.rereleases.Y5$Y5))) * 2  # twice as large = 0.06666667
((2 + 1) / sum(!is.na(final.scores.rereleases.Y5$Y5))) * 3  # three times as large = 0.1
# 10 YEAR TRENDS (k = 4)
(4 + 1) / sum(!is.na(final.scores.rereleases.Y10$Y10))        # 0.06756757
((4 + 1) / sum(!is.na(final.scores.rereleases.Y10$Y10))) * 2  # twice as large = 0.1351351
((4 + 1) / sum(!is.na(final.scores.rereleases.Y10$Y10))) * 3  # three times as large = 0.2027027
# FIX: the three-times value above was previously annotated 0.09473685,
# a copy-paste of the 1/2-year figure.

## CALCULATE COVARIANCE RATIO BOUNDARIES: 1 +/- [3(k+1)/n]
# FIX: the lower boundaries were previously computed as [3(k+1)/n] - 1
# (e.g. -0.905), a sign error; the rule is CVR < 1 - [3(k+1)/n].
# 1 AND 2 YEAR TRENDS (k = 2, n = 95)
1 + ((3 / 95) * 3)  # upper boundary = 1.094737
1 - ((3 / 95) * 3)  # lower boundary = 0.9052632
# 5 YEAR TRENDS (k = 2, n = 90)
1 + ((3 / 90) * 3)  # upper boundary = 1.1
1 - ((3 / 90) * 3)  # lower boundary = 0.9
# 10 YEAR TRENDS (k = 4, n = 74)
1 + ((5 / 74) * 3)  # upper boundary = 1.202703
1 - ((5 / 74) * 3)  # lower boundary = 0.7972973

# ---------------------------------------------------------------------------
##### TESTING ASSUMPTIONS OF INDEPENDENCE #####

# DURBIN-WATSON TEST (alternative hypothesis: true autocorrelation > 0)
dwtest(Y1.rerelease.2)
# DW = 1.6331, p-value = 0.02926
dwtest(Y2.rerelease.2)
# DW = 1.6008, p-value = 0.01998
dwtest(Y5.rerelease.2)
# DW = 1.6551, p-value = 0.04042
dwtest(Y10.rerelease.4)
# DW = 2.0138, p-value = 0.4931

## ACF PLOTS, HISTOGRAM AND BREUSCH-GODFREY TEST
checkresiduals(Y1.rerelease.2)
checkresiduals(Y2.rerelease.2)
checkresiduals(Y5.rerelease.2)
checkresiduals(Y10.rerelease.4)

# Box-Pierce test (fitdf = number of fitted regression predictors)
Box.test(residuals(Y1.rerelease.2), lag = 10, fitdf = 2)
# X-squared = 7.4451, df = 8, p-value = 0.4895
Box.test(residuals(Y2.rerelease.2), lag = 10, fitdf = 2)
# X-squared = 15.021, df = 8, p-value = 0.05874
Box.test(residuals(Y5.rerelease.2), lag = 10, fitdf = 2)
# X-squared = 9.0335, df = 8, p-value = 0.3395
# FIX: the Y10 model has 4 predictors, so fitdf = 4 (was 2). Previously
# logged output (fitdf = 2): X-squared = 9.0854, df = 8, p-value = 0.3351
Box.test(residuals(Y10.rerelease.4), lag = 10, fitdf = 4)

# Box-Ljung test
Box.test(residuals(Y1.rerelease.2), lag = 10, fitdf = 2, type = "Lj")
# X-squared = 7.9876, df = 8, p-value = 0.4347
Box.test(residuals(Y2.rerelease.2), lag = 10, fitdf = 2, type = "Lj")
# X-squared = 16.296, df = 8, p-value = 0.03833
Box.test(residuals(Y5.rerelease.2), lag = 10, fitdf = 2, type = "Lj")
# X-squared = 9.6929, df = 8, p-value = 0.2872
# FIX: fitdf = 4 for the four-predictor Y10 model (was 2). Previously
# logged output (fitdf = 2): X-squared = 10.211, df = 8, p-value = 0.2505
Box.test(residuals(Y10.rerelease.4), lag = 10, fitdf = 4, type = "Lj")

## PUBLICATION PLOTS
## ACF PLOTS
ACF.Y1  <- ggAcf(residuals(Y1.rerelease.2)) +
  ggtitle("ACF of residuals for One Year Changes")
ACF.Y2  <- ggAcf(residuals(Y2.rerelease.2)) +
  ggtitle("ACF of residuals for Two Year Changes")
ACF.Y5  <- ggAcf(residuals(Y5.rerelease.2)) +
  ggtitle("ACF of residuals for Five Year Changes")
ACF.Y10 <- ggAcf(residuals(Y10.rerelease.4)) +
  ggtitle("ACF of residuals for Ten Year Changes")
gridExtra::grid.arrange(ACF.Y1, ACF.Y2, ACF.Y5, ACF.Y10, nrow = 2)

## HISTOGRAMS
gghistogram(residuals(Y1.rerelease.2)) + ggtitle("Histogram of residuals")
gghistogram(residuals(Y2.rerelease.2)) + ggtitle("Histogram of residuals")
gghistogram(residuals(Y5.rerelease.2)) + ggtitle("Histogram of residuals")
gghistogram(residuals(Y10.rerelease.4)) + ggtitle("Histogram of residuals")

# ---------------------------------------------------------------------------
##### TESTING THE ASSUMPTION OF MULTICOLLINEARITY #####

# Variance Inflation Factor (VIF)
vif(Y1.rerelease.2)
# dog_hero   anthro
# 1.168006 1.168006
vif(Y2.rerelease.2)
vif(Y5.rerelease.2)
# dog_hero   anthro
# 1.171293 1.171293
vif(Y10.rerelease.4)
# dog_hero   anthro   western_ideals   nature_society
# 1.279087 1.394700         1.335717         1.061485

## TOLERANCE (1 / VIF)
1 / vif(Y1.rerelease.2)
1 / vif(Y2.rerelease.2)
1 / vif(Y5.rerelease.2)
1 / vif(Y10.rerelease.4)

# MEAN VIF
mean(vif(Y1.rerelease.2))
mean(vif(Y2.rerelease.2))
mean(vif(Y5.rerelease.2))
mean(vif(Y10.rerelease.4))

## Guidelines:
## - If the largest VIF is greater than 10 there is cause for concern
## - If the average VIF is greater than 1 the regression may be biased
## - Tolerance below 0.1 indicates a serious problem
## - Tolerance below 0.2 indicates a potential problem

# ---------------------------------------------------------------------------
#### TESTING RESIDUAL ASSUMPTIONS ####

# INITIAL RESIDUAL PLOT TESTS (base-R lm diagnostic plots)
plot(Y1.rerelease.2)
plot(Y2.rerelease.2)
plot(Y5.rerelease.2)
plot(Y10.rerelease.4)

# HISTOGRAM OF STUDENTIZED RESIDUALS
hist(final.scores.rereleases.Y1$studentized.residuals)
hist(final.scores.rereleases.Y2$studentized.residuals)
hist(final.scores.rereleases.Y5$studentized.residuals)
hist(final.scores.rereleases.Y10$studentized.residuals)
# ---------------------------------------------------------------------------
#### TESTING NORMALITY ####

library(pastecs)  # FIX: stat.desc() is used below but pastecs was never loaded

## SKEW TESTS
# FIX: the original cbind() combined vectors of unequal length (n = 95, 95,
# 90, 74 after NA removal), silently recycling the shorter ones and thereby
# corrupting the 5- and 10-year columns; run the descriptives per outcome.
round(stat.desc(final.scores.rereleases.Y1$Y1, basic = FALSE, norm = TRUE),
      digits = 3)
round(stat.desc(final.scores.rereleases.Y2$Y2, basic = FALSE, norm = TRUE),
      digits = 3)
round(stat.desc(final.scores.rereleases.Y5$Y5, basic = FALSE, norm = TRUE),
      digits = 3)
round(stat.desc(final.scores.rereleases.Y10$Y10, basic = FALSE, norm = TRUE),
      digits = 3)

## SHAPIRO-WILK TESTS (p < .05 is evidence against normality)
shapiro.test(final.scores.rereleases.Y1$Y1)
shapiro.test(final.scores.rereleases.Y2$Y2)
shapiro.test(final.scores.rereleases.Y5$Y5)
shapiro.test(final.scores.rereleases.Y10$Y10)

# Q-Q PLOTS of the outcome scores
# FIX: qplot(sample = ..., stat = "qq") relies on an argument removed from
# current ggplot2; build the plots explicitly with stat_qq().
ggplot(final.scores.rereleases.Y1, aes(sample = Y1)) + stat_qq()
ggplot(final.scores.rereleases.Y2, aes(sample = Y2)) + stat_qq()
ggplot(final.scores.rereleases.Y5, aes(sample = Y5)) + stat_qq()
ggplot(final.scores.rereleases.Y10, aes(sample = Y10)) + stat_qq()

## CREATING PUBLICATION PLOTS
# Histogram of studentized residuals with a normal-curve overlay, plus a
# Q-Q plot, for each outcome.
# FIX: the dnorm overlay previously omitted the sd argument, so the curve
# was always drawn with sd = 1 regardless of the residuals' actual spread.

# 1 YEAR PLOTS
histogram.Y1 <- ggplot(final.scores.rereleases.Y1, aes(studentized.residuals)) +
  theme(legend.position = "none") +
  geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
  labs(x = "Studentized Residual", y = "Density") +
  ggtitle("Histogram for One Year Changes")
hist.Y1 <- histogram.Y1 +
  stat_function(fun = dnorm,
                args = list(mean = mean(final.scores.rereleases.Y1$studentized.residuals,
                                        na.rm = TRUE),
                            sd = sd(final.scores.rereleases.Y1$studentized.residuals,
                                    na.rm = TRUE)),
                colour = "red", size = 1)
qqplot.resid.Y1 <- ggplot(final.scores.rereleases.Y1,
                          aes(sample = studentized.residuals)) +
  stat_qq() +
  labs(x = "Theoretical Values", y = "Observed Values") +
  ggtitle("Q-Q Plot for One Year Changes")
qqplot.resid.Y1

# 2 YEAR PLOTS
histogram.Y2 <- ggplot(final.scores.rereleases.Y2, aes(studentized.residuals)) +
  theme(legend.position = "none") +
  geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
  labs(x = "Studentized Residual", y = "Density") +
  ggtitle("Histogram for Two Year Changes")
hist.Y2 <- histogram.Y2 +
  stat_function(fun = dnorm,
                args = list(mean = mean(final.scores.rereleases.Y2$studentized.residuals,
                                        na.rm = TRUE),
                            sd = sd(final.scores.rereleases.Y2$studentized.residuals,
                                    na.rm = TRUE)),
                colour = "red", size = 1)
qqplot.resid.Y2 <- ggplot(final.scores.rereleases.Y2,
                          aes(sample = studentized.residuals)) +
  stat_qq() +
  labs(x = "Theoretical Values", y = "Observed Values") +
  ggtitle("Q-Q Plot for Two Year Changes")
qqplot.resid.Y2

# 5 YEAR PLOTS
histogram.Y5 <- ggplot(final.scores.rereleases.Y5, aes(studentized.residuals)) +
  theme(legend.position = "none") +
  geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
  labs(x = "Studentized Residual", y = "Density") +
  ggtitle("Histogram for Five Year Changes")
hist.Y5 <- histogram.Y5 +
  stat_function(fun = dnorm,
                args = list(mean = mean(final.scores.rereleases.Y5$studentized.residuals,
                                        na.rm = TRUE),
                            sd = sd(final.scores.rereleases.Y5$studentized.residuals,
                                    na.rm = TRUE)),
                colour = "red", size = 1)
qqplot.resid.Y5 <- ggplot(final.scores.rereleases.Y5,
                          aes(sample = studentized.residuals)) +
  stat_qq() +
  labs(x = "Theoretical Values", y = "Observed Values") +
  ggtitle("Q-Q Plot for Five Year Changes")
qqplot.resid.Y5

# 10 YEAR PLOTS
histogram.Y10 <- ggplot(final.scores.rereleases.Y10, aes(studentized.residuals)) +
  theme(legend.position = "none") +
  geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
  labs(x = "Studentized Residual", y = "Density") +
  ggtitle("Histogram for Ten Year Changes")
hist.Y10 <- histogram.Y10 +
  stat_function(fun = dnorm,
                args = list(mean = mean(final.scores.rereleases.Y10$studentized.residuals,
                                        na.rm = TRUE),
                            sd = sd(final.scores.rereleases.Y10$studentized.residuals,
                                    na.rm = TRUE)),
                colour = "red", size = 1)
qqplot.resid.Y10 <- ggplot(final.scores.rereleases.Y10,
                           aes(sample = studentized.residuals)) +
  stat_qq() +
  labs(x = "Theoretical Values", y = "Observed Values") +
  ggtitle("Q-Q Plot for Ten Year Changes")
qqplot.resid.Y10

# All histograms
gridExtra::grid.arrange(hist.Y1, hist.Y2, hist.Y5, hist.Y10, nrow = 2)
# All QQ Plots
gridExtra::grid.arrange(qqplot.resid.Y1, qqplot.resid.Y2,
                        qqplot.resid.Y5, qqplot.resid.Y10, nrow = 2)

# ---------------------------------------------------------------------------
###### HOMOGENEITY OF VARIANCE ######
## Scatterplot of studentized residuals against fitted values; a funnel
## shape would indicate heteroscedasticity.

# 1 YEAR TRENDS
scatter.Y1 <- ggplot(final.scores.rereleases.Y1,
                     aes(fitted, studentized.residuals)) +
  ggtitle("Scatterplot for One Year Changes")
S.Y1 <- scatter.Y1 + geom_point() +
  geom_smooth(method = "lm", colour = "Blue") +
  labs(x = "Fitted Values", y = "Studentized Residuals")
# 2 YEAR TRENDS
scatter.Y2 <- ggplot(final.scores.rereleases.Y2,
                     aes(fitted, studentized.residuals)) +
  ggtitle("Scatterplot for Two Year Changes")
S.Y2 <- scatter.Y2 + geom_point() +
  geom_smooth(method = "lm", colour = "Blue") +
  labs(x = "Fitted Values", y = "Studentized Residuals")
# 5 YEAR TRENDS
scatter.Y5 <- ggplot(final.scores.rereleases.Y5,
                     aes(fitted, studentized.residuals)) +
  ggtitle("Scatterplot for Five Year Changes")
S.Y5 <- scatter.Y5 + geom_point() +
  geom_smooth(method = "lm", colour = "Blue") +
  labs(x = "Fitted Values", y = "Studentized Residuals")
# 10 YEAR TRENDS
scatter.Y10 <- ggplot(final.scores.rereleases.Y10,
                      aes(fitted, studentized.residuals)) +
  ggtitle("Scatterplot for Ten Year Changes")
S.Y10 <- scatter.Y10 + geom_point() +
  geom_smooth(method = "lm", colour = "Blue") +
  labs(x = "Fitted Values", y = "Studentized Residuals")
# All scatter plots
gridExtra::grid.arrange(S.Y1, S.Y2, S.Y5, S.Y10, nrow = 2)

# ---------------------------------------------------------------------------
#### TESTING LINEARITY ####
## Residuals plotted against each predictor; curvature would indicate
## non-linearity.

# 1 YEAR TRENDS
P1.Y1 <- ggplot(final.scores.rereleases.Y1, aes(x = dog_hero, y = residuals)) +
  geom_point() + ggtitle("Scatterplot for One Year Changes") +
  labs(x = "Dog Hero", y = "Residuals")
P2.Y1 <- ggplot(final.scores.rereleases.Y1, aes(x = anthro, y = residuals)) +
  geom_point() + ggtitle("Scatterplot for One Year Changes") +
  labs(x = "Anthropomorphism", y = "Residuals")
gridExtra::grid.arrange(P1.Y1, P2.Y1, nrow = 1)

# 2 YEAR TRENDS
P1.Y2 <- ggplot(final.scores.rereleases.Y2, aes(x = dog_hero, y = residuals)) +
  geom_point() + ggtitle("Scatterplot for Two Year Changes") +
  labs(x = "Dog Hero", y = "Residuals")
P2.Y2 <- ggplot(final.scores.rereleases.Y2, aes(x = anthro, y = residuals)) +
  geom_point() + ggtitle("Scatterplot for Two Year Changes") +
  labs(x = "Anthropomorphism", y = "Residuals")
gridExtra::grid.arrange(P1.Y2, P2.Y2, nrow = 1)

# 5 YEAR TRENDS
P1.Y5 <- ggplot(final.scores.rereleases.Y5, aes(x = dog_hero, y = residuals)) +
  geom_point() + ggtitle("Scatterplot for Five Year Changes") +
  labs(x = "Dog Hero", y = "Residuals")
P2.Y5 <- ggplot(final.scores.rereleases.Y5, aes(x = anthro, y = residuals)) +
  geom_point() + ggtitle("Scatterplot for Five Year Changes") +
  labs(x = "Anthropomorphism", y = "Residuals")
gridExtra::grid.arrange(P1.Y1, P2.Y1, P1.Y2, P2.Y2, P1.Y5, P2.Y5, nrow = 3)

# 10 YEAR TRENDS (four predictors in the chosen model)
P1.Y10 <- ggplot(final.scores.rereleases.Y10, aes(x = dog_hero, y = residuals)) +
  geom_point() + labs(x = "Dog Hero", y = "Residuals")
P2.Y10 <- ggplot(final.scores.rereleases.Y10, aes(x = anthro, y = residuals)) +
  geom_point() + labs(x = "Anthropomorphism", y = "Residuals")
P3.Y10 <- ggplot(final.scores.rereleases.Y10, aes(x = western_ideals, y = residuals)) +
  geom_point() + labs(x = "Western Ideals", y = "Residuals")
P4.Y10 <- ggplot(final.scores.rereleases.Y10, aes(x = nature_society, y = residuals)) +
  geom_point() + labs(x = "Nature/Society Boundary", y = "Residuals")
# title code (add if needed): + ggtitle("Scatterplot for Ten Year Changes")
gridExtra::grid.arrange(P1.Y10, P2.Y10, P3.Y10, P4.Y10, nrow = 2)

# ---------------------------------------------------------------------------
####### TESTING ZERO VARIANCE #######
# FIX: apply(df, 2, var) coerces the whole data frame to a character matrix
# whenever any column is non-numeric (movie, character, breed here), so
# var() cannot be computed; restrict to plain numeric columns instead
# (the dfbeta matrix column is also excluded). The constant NoVariance
# column added earlier is the expected zero-variance hit.
plain.numeric.Y1 <- vapply(final.scores.rereleases.Y1,
                           function(col) is.numeric(col) && is.null(dim(col)),
                           logical(1))
Y1.variances <- vapply(final.scores.rereleases.Y1[plain.numeric.Y1],
                       var, numeric(1), na.rm = TRUE)
round(Y1.variances, digits = 2)
which(Y1.variances == 0)

## END
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
######## EXPLORATORY ANALYSIS ########
# Joins character- and movie-level attributes onto the factor scores, then
# explores trends over time.

## LOAD PACKAGES
library(tidyverse) # for data manipulation
library(ggplot2)   # for creating plots
library(lm.beta)   # for regression analysis
library(dplyr)
library(tidyr)
library(pander)    # to format outputs that can be used in docs
library(forcats)
library(broom)
library(pastecs)   # provides stat.desc()
library(WRS2)
library(clinfun)   # to run jonckheere test
library(gmodels)   # to run the chi-square test

## SET WORKING DIRECTORY
setwd(" ")  # NOTE(review): placeholder - set to the data directory before sourcing

## LOAD DATA
movie.list                 <- read_csv("movie_list_v2.csv")
character.list             <- read_csv("character_list_v2.csv")
final.scores.no.rereleases <- read_csv("final_score_v6_without_rereleases.csv")
final.scores.rereleases    <- read_csv("final_score_v6_with_rereleases.csv")
character.movie.ratio      <- read_csv("character_movie_ratio.csv")

## JOIN TABLES
# Attach character attributes (sex, availability) and movie attributes
# (decade, type, boy_and_dog) to the factor scores.
final.score.all <- final.scores.rereleases %>%
  left_join(select(character.list, character_id, sex, availability),
            by = "character_id") %>%  # FIX: c("character_id" = "character_id") was redundant
  left_join(select(movie.list, movie_id, decade, type, boy_and_dog),
            by = "movie_id")
# NOTE(review): this file is tab-separated despite the .csv extension; kept
# as-is because the hand-edited "v2" file below is derived from it.
write.table(final.score.all, "Scores with all variables.csv", sep = "\t",
            row.names = FALSE)

## RUN FOR DUMMY CODING CATEGORICAL VARIABLES ONLY ##
final.score.all.v2 <- read_csv("Scores with all variables v2.csv")
final.score.all.v2$sex <- factor(final.score.all.v2$sex, levels = c(0, 1),
                                 labels = c("Female", "Male"))

# ---------------------------------------------------------------------------
##### GRAPH OF MOVIES TRENDS OVER TIME #####
ggplot(final.score.all, aes(x = year, y = Y1)) +
  geom_point() + geom_smooth(method = "lm")
ggplot(final.score.all, aes(x = year, y = Y10)) +
  geom_point() + geom_smooth(method = "lm")
ggplot(final.score.all, aes(x = anthro, y = Y1)) +
  geom_point() + geom_smooth(method = "lm")
# ---------------------------------------------------------------------------
##### CHECK ASSUMPTIONS #####

## PORTRAYAL BY SEX
# Step 1: Check assumptions - plot data
ggplot(final.score.all, aes(x = sex, y = dog_hero, fill = sex)) + geom_boxplot()
# error bars overlap indicating no between group differences
ggplot(final.score.all, aes(x = sex, y = anthro, fill = sex)) + geom_boxplot()
# error bars overlap indicating no between group differences
my_comparisons <- list(c("Female", "Male"))
ggplot(final.score.all, aes(x = sex, y = western_ideals, fill = sex)) +
  geom_boxplot() +
  ggtitle("Western Ideals Score by Sex of Character") +
  labs(x = "Sex", y = "Western Ideals", fill = "Sex of Character") +
  theme(plot.title = element_text(hjust = 0.5, size = 22),
        axis.text = element_text(size = 12, face = "bold"),
        axis.title = element_text(size = 12, face = "bold"),
        legend.title = element_text(size = 14),
        # legend.position = "bottom",
        legend.key.size = unit(1.5, "cm"))
# error bars overlap indicating no between group differences
ggplot(final.score.all, aes(x = sex, y = nature_society, fill = sex)) +
  geom_boxplot()
# error bars overlap indicating no between group differences

# Step 2: Check descriptive statistics (pastecs norm tests; Shapiro-Wilk
# p < .05 is evidence AGAINST normality)
by(final.score.all$dog_hero, final.score.all$sex,
   stat.desc, basic = FALSE, norm = TRUE)
# NOTE(review): the original note here ("Females p < 0.05 so assume normality
# but males p > 0.05 so likely not normal") inverts the Shapiro-Wilk logic -
# re-check the group p-values and restate which group departs from normality.
by(final.score.all$anthro, final.score.all$sex,
   stat.desc, basic = FALSE, norm = TRUE)
# both p < 0.05 so cannot assume normality
by(final.score.all$western_ideals, final.score.all$sex,
   stat.desc, basic = FALSE, norm = TRUE)
# both p < 0.05 so cannot assume normality
by(final.score.all$nature_society, final.score.all$sex,
   stat.desc, basic = FALSE, norm = TRUE)
# both p < 0.05 so cannot assume normality

## PORTRAYAL BY MOVIE TYPE
# NOTE(review): the .y suffix on type.y implies both joined tables carried a
# "type" column - confirm type.y is the movie-level type from movie.list.
# Step 1: Check assumptions - plot data
ggplot(final.score.all, aes(x = type.y, y = dog_hero, fill = type.y)) +
  geom_boxplot()
# error bars overlap indicating no between group differences
ggplot(final.score.all, aes(x = type.y, y = anthro, fill = type.y)) +
  geom_boxplot()
# error bars overlap indicating no between group differences
ggplot(final.score.all, aes(x = type.y, y = western_ideals, fill = type.y)) +
  geom_boxplot()
# error bars overlap indicating no between group differences
ggplot(final.score.all, aes(x = type.y, y = nature_society, fill = type.y)) +
  geom_boxplot()
# error bars overlap indicating no between group differences

# Step 2: Check descriptive statistics
by(final.score.all$dog_hero, final.score.all$type.y,
   stat.desc, basic = FALSE, norm = TRUE)
# NOTE(review): the original note here ("Animation p < 0.05 so assume
# normality but Live Action p > 0.05 so likely not normal") inverts the
# Shapiro-Wilk logic - re-check the group p-values and restate.
by(final.score.all$anthro, final.score.all$type.y,
   stat.desc, basic = FALSE, norm = TRUE)
# both p < 0.05 so cannot assume normality
by(final.score.all$western_ideals, final.score.all$type.y,
   stat.desc, basic = FALSE, norm = TRUE)
# both p < 0.05 so cannot assume normality
by(final.score.all$nature_society, final.score.all$type.y,
   stat.desc, basic = FALSE, norm = TRUE)
# both p < 0.05 so cannot assume normality

# ---------------------------------------------------------------------------
###### NON-PARAMETRIC TESTS ######

## FUNCTION
# Effect size r for a Wilcoxon rank-sum test: r = z / sqrt(N), with z
# recovered from the test's two-tailed p-value.
# NOTE(review): the "Wileox" typo in the name is kept deliberately because
# later sections call the function by this spelling. r is negative by
# construction (qnorm of p/2 is <= 0 for p <= 1) - report |r| as the
# effect-size magnitude.
rFromWileox <- function(wilcoxModel, N) {
  z <- qnorm(wilcoxModel$p.value / 2)
  r <- z / sqrt(N)
  cat(wilcoxModel$data.name, "Effect Size, r = ", r)
}

###### PORTRAYAL BY SEX ######

## DOG HERO MODELS ##
dog.hero.sex <- wilcox.test(dog_hero ~ sex, final.score.all)
dog.hero.sex
final.score.all %>%
  group_by(sex) %>%
  summarise(dog_hero_median = median(dog_hero, na.rm = TRUE),
            dog_hero_ideals_sd = sd(dog_hero, na.rm = TRUE))
rFromWileox(dog.hero.sex, 95)

## ANTHRO MODELS ##
anthro.sex <- wilcox.test(anthro ~ sex, final.score.all)
anthro.sex
final.score.all %>%
  group_by(sex) %>%
  summarise(median = median(anthro, na.rm = TRUE),
            sd = sd(anthro, na.rm = TRUE))
rFromWileox(anthro.sex, 95)

## WESTERN IDEALS ##
western.ideals.sex <- wilcox.test(western_ideals ~ sex, final.score.all)
western.ideals.sex
final.score.all %>%
  group_by(sex) %>%
  summarise(median = median(western_ideals, na.rm = TRUE),
            sd = sd(western_ideals, na.rm = TRUE))
rFromWileox(western.ideals.sex, 95)

## NATURE/SOCIETY BOUNDARY MODELS ##
nature.society.sex <- wilcox.test(nature_society ~ sex, final.score.all)
nature.society.sex
final.score.all %>%
  group_by(sex) %>%
  summarise(median = median(nature_society, na.rm = TRUE),
            sd = sd(nature_society, na.rm = TRUE))
rFromWileox(nature.society.sex, 95)

# --------------------------------------------------------------------------- #
###### PORTRAYAL BY MOVIE TYPE ######

## DOG HERO MODELS ##
dog.hero.type <- wilcox.test(dog_hero ~ type.y, final.score.all)
dog.hero.type
final.score.all %>%
  group_by(type.y) %>%
  summarise(mean = mean(dog_hero, na.rm = TRUE),
            sd = sd(dog_hero, na.rm = TRUE))
rFromWileox(dog.hero.type, 95)

## ANTHRO MODELS ##
anthro.type <- wilcox.test(anthro ~ type.y, final.score.all)
anthro.type
final.score.all %>%
  group_by(type.y) %>%
  summarise(mean = mean(anthro, na.rm = TRUE),
            sd = sd(anthro, na.rm = TRUE))
rFromWileox(anthro.type, 95)

## WESTERN IDEALS ##
western.ideals.type <- wilcox.test(western_ideals ~ type.y, final.score.all)
western.ideals.type
final.score.all %>%
  group_by(type.y) %>%
  summarise(mean = mean(western_ideals, na.rm = TRUE),
            sd = sd(western_ideals, na.rm = TRUE))
rFromWileox(western.ideals.type, 95)

## NATURE/SOCIETY BOUNDARY ##
nature.society.type <- wilcox.test(nature_society ~ type.y, final.score.all)
nature.society.type
final.score.all %>%
  group_by(type.y) %>%
  summarise(mean = mean(nature_society, na.rm = TRUE),
            sd = sd(nature_society, na.rm = TRUE))
rFromWileox(nature.society.type, 95)

# --------------------------------------------------------------------------- #
##### SCORE CHANGES ACROSS TIME #####
# Simple linear regressions of each portrayal score on release year.

## DOG HERO MODELS ##
dog.hero.year <- lm(dog_hero ~ year, final.score.all)
summary(dog.hero.year)
lm.beta(dog.hero.year)

## ANTHRO MODELS ##
anthro.year <- lm(anthro ~ year, final.score.all)
summary(anthro.year)
lm.beta(anthro.year)

## WESTERN IDEALS ##
western.ideals.year <- lm(western_ideals ~ year, final.score.all)
summary(western.ideals.year)
lm.beta(western.ideals.year)

## NATURE/SOCIETY BOUNDARY ##
nature_society.year <- lm(nature_society ~ year, final.score.all)
summary(nature_society.year)
lm.beta(nature_society.year)

# --------------------------------------------------------------------------- #
##### SCORE CHANGES ACROSS DECADES #####
# NOTE(review): `decade` enters aov() as a numeric covariate (df = 1 in the
# pasted output below), i.e. a linear-trend test rather than a grouped ANOVA.
# Convert `decade` to a factor if a between-decade comparison was intended.

## DOG HERO MODELS ##
ggplot(final.score.all, aes(x = decade, y = dog_hero)) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm")
aov(dog_hero ~ decade, final.score.all) %>% tidy()
#   term      df sumsq meansq statistic p.value
# 1 decade     1 0.138 0.138       2.78  0.0990
# 2 Residuals 93 4.63  0.0497     NA    NA
# Mostly stays the same but overall downward trend
# A resurgence in the 1990s
pairwise.t.test(final.score.all$dog_hero, final.score.all$decade,
                p.adjust.method = "bonferroni")
#      1930 1940 1950 1960 1970 1980 1990
# 1940 1.00 -    -    -    -    -    -
# 1950 1.00 1.00 -    -    -    -    -
# 1960 1.00 1.00 1.00 -    -    -    -
# 1970 0.50 1.00 1.00 1.00 -    -    -
# 1980 1.00 1.00 1.00 1.00 1.00 -    -
# 1990 1.00 1.00 1.00 1.00 0.81 1.00 -
# 2000 1.00 1.00 1.00 1.00 1.00 1.00 1.00
# P value adjustment method: bonferroni

## ANTHRO MODELS ##
aov(anthro ~ decade, final.score.all) %>% tidy()
#   term      df sumsq meansq statistic p.value
# 1 decade     1 0.176 0.176       2.61  0.110
# 2 Residuals 93 6.28  0.0675     NA    NA
ggplot(final.score.all, aes(x = decade, y = anthro)) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm")
# CONSISTENCY FIX: this call used p.adjust.method = "BH" while its pasted
# output footer and every sibling comparison use Bonferroni; aligned to
# "bonferroni" to match the recorded output.
pairwise.t.test(final.score.all$anthro, final.score.all$decade,
                p.adjust.method = "bonferroni")
#      1930  1940  1950  1960  1970  1980  1990
# 1940 1.000 -     -     -     -     -     -
# 1950 1.000 1.000 -     -     -     -     -
# 1960 1.000 1.000 1.000 -     -     -     -
# 1970 1.000 1.000 1.000 0.262 -     -     -
# 1980 0.841 0.796 1.000 1.000 0.086 -     -
# 1990 1.000 1.000 1.000 1.000 1.000 1.000 -
# 2000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
# P value adjustment method: bonferroni

## WESTERN IDEALS MODELS ##
# CONSISTENCY FIX: used final.score.all.v2 while all sibling models use
# final.score.all; the tables differ only in the dummy coding of `sex`,
# which this model does not involve.
aov(western_ideals ~ decade, final.score.all) %>% tidy()
#   term      df sumsq  meansq statistic p.value
# 1 decade     1 0.0755 0.0755      1.21  0.273
# 2 Residuals 93 5.79   0.0622     NA    NA
ggplot(final.score.all, aes(x = decade, y = western_ideals)) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm")
pairwise.t.test(final.score.all$western_ideals, final.score.all$decade,
                p.adjust.method = "bonferroni")
#      1930 1940 1950 1960 1970 1980 1990
# 1940 1.00 -    -    -    -    -    -
# 1950 1.00 1.00 -    -    -    -    -
# 1960 1.00 0.79 1.00 -    -    -    -
# 1970 1.00 1.00 1.00 1.00 -    -    -
# 1980 1.00 1.00 1.00 1.00 1.00 -    -
# 1990 1.00 0.90 1.00 1.00 1.00 1.00 -
# 2000 1.00 1.00 1.00 1.00 1.00 1.00 1.00
# P value adjustment method: bonferroni

## NATURE/SOCIETY BOUNDARY ##
aov(nature_society ~ decade, final.score.all) %>% tidy()
#   term      df sumsq   meansq  statistic p.value
# 1 decade     1 0.00731 0.00731     0.389  0.534
# 2 Residuals 93 1.75    0.0188     NA     NA
ggplot(final.score.all, aes(x = decade, y = nature_society)) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm")
pairwise.t.test(final.score.all$nature_society, final.score.all$decade,
                p.adjust.method = "bonferroni")
#      1930 1940 1950 1960 1970 1980 1990
# 1940 1.00 -    -    -    -    -    -
# 1950 1.00 1.00 -    -    -    -    -
# 1960 1.00 1.00 1.00 -    -    -    -
# 1970 1.00 1.00 1.00 1.00 -    -    -
# 1980 1.00 1.00 1.00 1.00 1.00 -    -
# 1990 1.00 1.00 1.00 1.00 1.00 0.34 -
# 2000 1.00 1.00 1.00 1.00 1.00 1.00 1.00
# P value adjustment method: bonferroni

# --------------------------------------------------------------------------- #
##### TREND CHANGE BY DECADE #####

## ONE YEAR CHANGES ##
aov(Y1 ~ decade, final.score.all) %>% tidy()
#   term      df sumsq meansq statistic p.value
# 1 decade     1 335.  335.       8.11  0.00541
# 2 Residuals 93 3840. 41.3      NA    NA
pairwise.t.test(final.score.all$Y1, final.score.all$decade,
                p.adjust.method = "bonferroni")
#      1930  1940  1950  1960  1970  1980  1990
# 1940 1.000 -     -     -     -     -     -
# 1950 1.000 1.000 -     -     -     -     -
# 1960 0.636 1.000 1.000 -     -     -     -
# 1970 0.045 1.000 1.000 1.000 -     -     -
# 1980 0.133 1.000 1.000 1.000 1.000 -     -
# 1990 0.016 1.000 1.000 1.000 1.000 1.000 -
# 2000 0.362 1.000 1.000 1.000 1.000 1.000 1.000
# P value adjustment method: bonferroni
ggplot(final.score.all, aes(x = decade, y = Y1)) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm")

## TWO YEAR CHANGES ##
aov(Y2 ~ decade, final.score.all) %>% tidy()
#   term      df sumsq meansq statistic p.value
# 1 decade     1 301.  301.       2.92  0.0909
# 2 Residuals 93 9582. 103.      NA    NA
pairwise.t.test(final.score.all$Y2, final.score.all$decade,
                p.adjust.method = "bonferroni")
#      1930  1940  1950  1960  1970  1980  1990
# 1940 1.000 -     -     -     -     -     -
# 1950 0.460 1.000 -     -     -     -     -
# 1960 0.642 1.000 1.000 -     -     -     -
# 1970 0.187 1.000 1.000 1.000 -     -     -
# 1980 0.080 1.000 1.000 1.000 1.000 -     -
# 1990 0.019 1.000 1.000 1.000 1.000 1.000 -
# 2000 1.000 1.000 1.000 1.000 1.000 1.000 0.569
# P value adjustment method: bonferroni

## FIVE YEAR CHANGES ##
aov(Y5 ~ decade, final.score.all) %>% tidy()
#   term      df sumsq  meansq statistic p.value
# 1 decade     1 829.   829.       4.44  0.0379
# 2 Residuals 88 16430. 187.      NA    NA
pairwise.t.test(final.score.all$Y5, final.score.all$decade,
                p.adjust.method = "bonferroni")
#      1930  1940  1950  1960  1970  1980  1990
# 1940 1.000 -     -     -     -     -     -
# 1950 1.000 1.000 -     -     -     -     -
# 1960 1.000 1.000 1.000 -     -     -     -
# 1970 1.000 1.000 1.000 1.000 -     -     -
# 1980 1.000 1.000 1.000 1.000 1.000 -     -
# 1990 0.432 0.096 1.000 0.114 1.000 1.000 -
# 2000 1.000 1.000 1.000 1.000 1.000 1.000 0.214
# P value adjustment method: bonferroni

## TEN YEAR CHANGES ##
aov(Y10 ~ decade, final.score.all) %>% tidy()
#   term      df sumsq  meansq statistic p.value
# 1 decade     1 1072.  1072.      4.53  0.0368
# 2 Residuals 72 17048. 237.      NA    NA
pairwise.t.test(final.score.all$Y10, final.score.all$decade,
                p.adjust.method = "bonferroni")
# No values...

# --------------------------------------------------------------------------- #
###### CHANGE OF SEX ACROSS DECADES ######
# NOTE(review): the pasted output implies `sex` was numeric (0/1) when this
# was run; after the factor conversion in final.score.all.v2 this aov()/lm()
# would error on a factor response -- run on the raw 0/1 column.
aov(sex ~ decade, final.score.all.v2) %>% tidy()
#   term      df sumsq  meansq statistic p.value
# 1 decade     1 0.0750 0.0750     0.402  0.528
# 2 Residuals 93 17.4   0.187     NA     NA
pairwise.t.test(final.score.all.v2$sex, final.score.all.v2$decade,
                p.adjust.method = "bonferroni")
#      1930 1940 1950 1960 1970 1980 1990
# 1940 1    -    -    -    -    -    -
# 1950 1    1    -    -    -    -    -
# 1960 1    1    1    -    -    -    -
# 1970 1    1    1    1    -    -    -
# 1980 1    1    1    1    1    -    -
# 1990 1    1    1    1    1    1    -
# 2000 1    1    1    1    1    1    1
# P value adjustment method: bonferroni

# --------------------------------------------------------------------------- #
####### CHANGE OF SEX ACROSS TIME #######
sex.year.model <- lm(sex ~ year, final.score.all.v2)
summary(sex.year.model)

# --------------------------------------------------------------------------- #
####### CHANGE OF CHARACTERS ANALYSED ACROSS DECADES #######
# BUG FIXES: removed a dangling `final.score.all$ratio <-` (which chained
# into the next assignment and would fail on length mismatch) and an empty
# `ggplot(final.score.all, aes())` leftover; the Wilcoxon test now uses the
# two-decade subset that is actually built (the original filtered but then
# tested the full 8-group table, which wilcox.test rejects); the subset no
# longer shadows dplyr::filter().
aov(ratio ~ decade, character.movie.ratio) %>% tidy()
#   term      df sumsq meansq statistic p.value
# 1 decade     1 1.10  1.10       11.0   0.0160
# 2 Residuals  6 0.599 0.0998    NA      NA
pairwise.t.test(character.movie.ratio$ratio, character.movie.ratio$decade,
                p.adjust.method = "BH")

## Change the decades that you want to compare before running
ratio.subset <- character.movie.ratio %>%
  filter(decade == 1930 | decade == 1940)
ratio.model <- wilcox.test(ratio ~ decade, ratio.subset)
ratio.model

# Dog-hero-by-sex descriptives and effect size, repeated here from the sex
# section for convenience (means rather than medians this time).
final.score.all %>%
  group_by(sex) %>%
  summarise(dog_hero_mean = mean(dog_hero, na.rm = TRUE),
            dog_hero_ideals_sd = sd(dog_hero, na.rm = TRUE))
rFromWileox(dog.hero.sex, 95)

### END
# --------------------------------------------------------------------------- #
##### PUBLICATION FIGURES v2 #####
# See https://bbc.github.io/rcookbook/ for code to create BBC style graphs

## LOAD PACKAGES ##
if (!require(pacman)) install.packages("pacman")
pacman::p_load('dplyr', 'tidyr', 'gapminder', 'ggplot2', 'ggalt', 'forcats',
               'R.utils', 'png', 'grid', 'ggpubr', 'scales', 'bbplot')
# FIX: install on demand instead of unconditionally reinstalling every run.
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
if (!requireNamespace("ggpubr", quietly = TRUE)) install.packages("ggpubr")
if (!requireNamespace("ggsignif", quietly = TRUE)) install.packages("ggsignif")
if (!requireNamespace("bbplot", quietly = TRUE)) {
  devtools::install_github("bbc/bbplot")
}
library(devtools)
library(bbplot)
library(tidyverse)  # for data manipulation
library(ggplot2)    # for creating plots
library(dplyr)
library(tidyr)
library(ggpubr)
library(ggsignif)

## SET WORKING DIRECTORY
# NOTE(review): placeholder path -- fill in before running.
setwd(" ")

## LOAD DATA
# (the original read character_movie_ratio.csv twice; one read suffices)
character.movie.ratio <- read_csv("character_movie_ratio.csv")
character.sex.ratio <- read_csv("character_sex_ratio.csv")
movie.list <- read_csv("movie_list_v2.csv")
character.list <- read_csv("character_list_v2.csv")
final.scores.no.rereleases <- read_csv("final_score_v6_without_rereleases.csv")
final.scores.rereleases <- read_csv("final_score_v6_with_rereleases.csv")
releases.over.time <- read_csv("releases_over_time.csv")

## JOIN TABLES
# Attach character-level (sex, availability) and movie-level (decade, type,
# boy_and_dog) variables to the per-character score table.
final.score.all <- final.scores.rereleases %>%
  left_join(select(character.list, character_id, sex, availability),
            by = "character_id") %>%
  left_join(select(movie.list, movie_id, decade, type, boy_and_dog),
            by = "movie_id")
# --------------------------------------------------------------------------- #
##### FIGURE 1 #####
### AVERAGE NUMBER OF CHARACTERS PER MOVIE BY DECADE ###
## To Create Plot
fig1 <- ggplot(character.movie.ratio, aes(x = decade, y = ratio)) +
  geom_line(colour = "#1380A1", size = 1) +
  geom_hline(yintercept = 1.0, size = 1, colour = "#333333") +
  scale_x_continuous(limits = c(1930, 2000),
                     breaks = seq(1930, 2000, by = 10)) +
  scale_y_continuous(limits = c(1.0, 2.6),
                     breaks = seq(1.0, 2.5, by = 0.5)) +
  bbc_style() +
  labs(title = "Average Number of Dog Characters Analysed per Movie",
       subtitle = "Greater number of prominent dogs as the decade progresses") +
  theme(plot.title = element_text(size = 21),
        axis.title = element_text(size = 18)) +
  labs(x = "Decade", y = "Character to Movie Ratio") +
  geom_curve(aes(x = 1956, y = 2.4, xend = 1969, yend = 2.5),
             colour = "#555555", curvature = -.2, size = .5,
             arrow = arrow(length = unit(0.03, "npc"))) +
  geom_label(aes(x = 1930, y = 2.2,
                 label = "Peaks because of the\nsix Dobermans in\n'The Doberman Gang'"),
             hjust = 0, vjust = 0, colour = "#555555", fill = "white",
             label.size = NA, size = 6)
fig1

## To Save Plot
# NOTE(review): save_filepath is a placeholder -- fill in before running.
finalise_plot(plot_name = fig1,
              source = "The Making of a (Dog) Movie Star",
              save_filepath = " ",
              width_pixels = 640, height_pixels = 450)

# --------------------------------------------------------------------------- #
###### FIGURE 2 ######
## % of Sex of Characters across Decades
# BUG FIX: the original pipeline contained a stray "+)" after labs(),
# which is a syntax error; removed.
fig2 <- ggplot(character.sex.ratio, aes(x = decade, y = per_char, colour = sex)) +
  geom_line(size = 1) +
  geom_hline(yintercept = 0, size = 1, colour = "#333333") +
  scale_colour_manual(values = c("#FAAB18", "#1380A1")) +
  scale_x_continuous(limits = c(1930, 2010),
                     breaks = seq(1930, 2000, by = 10)) +
  scale_y_continuous(limits = c(0, 100),
                     labels = function(y) paste0(y, " %")) +
  bbc_style() +
  labs(title = "Percentage of Characters' Sex Across Decades",
       subtitle = "Prominent dog characters are disproportionately male") +
  theme(plot.title = element_text(size = 21),
        legend.position = "none",
        axis.title = element_text(size = 18)) +
  labs(x = "Decade", y = "% of sex for characters in each decade") +
  geom_label(aes(x = 2003, y = 66.7, label = "Male"),
             hjust = 0, vjust = 0.5, colour = "#1380A1", fill = "white",
             label.size = NA, family = "Helvetica", size = 6) +
  geom_label(aes(x = 2003, y = 33.3, label = "Female"),
             hjust = 0, vjust = 0.5, colour = "#FAAB18", fill = "white",
             label.size = NA, family = "Helvetica", size = 6)
fig2
finalise_plot(plot_name = fig2,
              source = "The Making of a (Dog) Movie Star",
              save_filepath = " ",
              width_pixels = 640, height_pixels = 450)

# --------------------------------------------------------------------------- #
###### FIGURE 3 ######
### Western Ideals Score by Sex of Character
my_comparisons <- list(c("Male", "Female"))
fig3 <- ggplot(final.score.all, aes(x = sex, y = western_ideals, fill = sex)) +
  geom_boxplot() +
  geom_hline(yintercept = 0, size = 1, colour = "#333333") +
  scale_fill_manual(values = c("#FAAB18", "#1380A1")) +
  scale_y_continuous(limits = c(0, 1.1),
                     labels = c("0%", "25%", "50%", "75%", "100%"),
                     breaks = seq(0, 1.1, by = 0.25)) +
  bbc_style() +
  labs(title = "Western Ideals Score by Character's Sex",
       subtitle = "Female characters are more likely to portray Western Ideals") +
  theme(legend.position = "none") +
  geom_signif(comparisons = list(c("Female", "Male")),
              map_signif_level = TRUE, textsize = 7) +
  geom_label(aes(x = 2.15, y = 1, label = "Dipstick"),
             hjust = 0, vjust = 0.5, colour = "#1380A1", fill = "white",
             label.size = NA, family = "Helvetica", size = 6) +
  geom_label(aes(x = 2.15, y = 0.78, label = "Pongo (1961)"),
             hjust = 0, vjust = 0.5, colour = "#1380A1", fill = "white",
             label.size = NA, family = "Helvetica", size = 6) +
  geom_label(aes(x = 1.3, y = 0.89, label = "Pongo (1996)"),
             hjust = 0, vjust = 0.5, colour = "#1380A1", fill = "white",
             label.size = NA, family = "Helvetica", size = 6) +
  geom_curve(aes(x = 2.14, y = 1, xend = 2.03, yend = 1),
             colour = "#555555", curvature = .2, size = .5,
             arrow = arrow(length = unit(0.03, "npc"))) +
  geom_curve(aes(x = 2.14, y = 0.78, xend = 2.03, yend = 0.78),
             colour = "#555555", curvature = -.2, size = .5,
             arrow = arrow(length = unit(0.03, "npc"))) +
  geom_curve(aes(x = 1.75, y = 0.89, xend = 1.95, yend = 0.89),
             colour = "#555555", curvature = -.2, size = .5,
             arrow = arrow(length = unit(0.03, "npc")))
fig3
finalise_plot(plot_name = fig3,
              source = "The Making of a (Dog) Movie Star",
              save_filepath = " ",
              width_pixels = 640, height_pixels = 450)

# --------------------------------------------------------------------------- #
##### FIGURE 4 #####
### Rereleases over time
## Lady and the Tramp Results
# Prepare Data: reshape Y1/Y2/Y5/Y10 into long form (one row per release x
# time period).  gather() is superseded by pivot_longer() but kept here so
# row ordering matches the original output exactly.
lady.rereleases <- gather(final.score.all, key = "time_period",
                          value = "score", Y1, Y2, Y5, Y10) %>%
  arrange(year) %>%
  filter(movie == "Lady and the Tramp" & breed == "Spaniel (All Cockers)")
# Treat year as a discrete axis category.
lady.rereleases$year <- as.character(as.numeric(lady.rereleases$year))
is.character(lady.rereleases$year)  # sanity check
lady.rereleases$time_period <- factor(lady.rereleases$time_period,
                                      levels = c("Y1", "Y2", "Y5", "Y10"))
# Plot graph
fig4 <- ggplot(lady.rereleases,
               aes(x = year, y = score, fill = as.factor(time_period))) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_hline(yintercept = 0, size = 1, colour = "#333333") +
  scale_y_continuous(limits = c(-15, 20), breaks = seq(-15, 20, by = 10)) +
  bbc_style() +
  scale_fill_manual(values = c("#FAAB18", "#1380A1", "#990000", "#588300"),
                    labels = c("1 Year Changes", "2 Year Changes",
                               "5 Year Changes", "10 Year Changes")) +
  labs(title = "Cocker Spaniel Registration Changes Relative to all AKC\nRegistrations for each Rerelease of 'Lady and the Tramp'") +
  theme(plot.title = element_text(size = 21),
        legend.position = "bottom",
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 16)) +
  labs(x = "Theatre Release Year",
       y = "Registration Changes\nRelative to all Breeds") +
  geom_curve(aes(x = 0.6, y = 11, xend = 0.9, yend = 5),
             colour = "#555555", curvature = -.2, size = .5,
             arrow = arrow(length = unit(0.03, "npc"))) +
  geom_label(aes(x = 0.115, y = 11.5, label = "Original Release"),
             hjust = 0, vjust = 0, colour = "#555555", fill = "white",
             label.size = NA, size = 6) +
  geom_curve(aes(x = 4.5, y = 11, xend = 4.2, yend = 2),
             colour = "#555555", curvature = -.2, size = .5,
             arrow = arrow(length = unit(0.03, "npc"))) +
  geom_label(aes(x = 3.6, y = 11.5,
                 label = "Largest Box Office Intake\nof all releases"),
             hjust = 0, vjust = 0, colour = "#555555", fill = "white",
             label.size = NA, size = 6)
fig4
finalise_plot(plot_name = fig4,
              source = "The Making of a (Dog) Movie Star",
              save_filepath = " ",
              width_pixels = 640, height_pixels = 450)

# --------------------------------------------------------------------------- #
##### FIGURE 5 #####
### Rereleases over time
## One Hundred and One Dalmatians Results
# Prepare Data (same long-form reshape as Figure 4).
Dalmatian.rereleases <- gather(final.score.all, key = "time_period",
                               value = "score", Y1, Y2, Y5, Y10) %>%
  arrange(year) %>%
  filter(movie == "One Hundred and One Dalmatians" & breed == "Dalmatian")
Dalmatian.rereleases$year <- as.character(as.numeric(Dalmatian.rereleases$year))
is.character(Dalmatian.rereleases$year)  # sanity check
Dalmatian.rereleases$time_period <- factor(Dalmatian.rereleases$time_period,
                                           levels = c("Y1", "Y2", "Y5", "Y10"))
# Plot graph
fig5 <- ggplot(Dalmatian.rereleases,
               aes(x = year, y = score, fill = as.factor(time_period))) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_hline(yintercept = 0, size = 1, colour = "#333333") +
  # scale_y_continuous(limits = c(-15, 20),
  #                    breaks = seq(-15, 20, by = 10)) +
  bbc_style() +
  scale_fill_manual(values = c("#FAAB18", "#1380A1", "#990000", "#588300"),
                    labels = c("1 Year Changes", "2 Year Changes",
                               "5 Year Changes", "10 Year Changes")) +
  labs(title = "Dalmatian Registration Changes Relative to all AKC\nRegistrations for each Rerelease of One Hundred and One Dalmatians") +
  theme(plot.title = element_text(size = 18),
        legend.position = "bottom",
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 16)) +
  labs(x = "Theatre Release Year",
       y = "Registration Changes\nRelative to all Breeds") +
  geom_curve(aes(x = 0.6, y = 11, xend = 0.9, yend = 5),
             colour = "#555555", curvature = -.2, size = .5,
             arrow = arrow(length = unit(0.03, "npc"))) +
  geom_label(aes(x = 0.115, y = 11.5, label = "Original Release"),
             hjust = 0, vjust = 0, colour = "#555555", fill = "white",
             label.size = NA, size = 6) +
  geom_curve(aes(x = 2.2, y = -10.5, xend = 1.9, yend = -1.6),
             colour = "#555555", curvature = -.2, size = .5,
             arrow = arrow(length = unit(0.03, "npc"))) +
  geom_label(aes(x = 2.2, y = -16,
                 label = "Largest Box Office Intake\nof all releases"),
             hjust = 0, vjust = 0, colour = "#555555", fill = "white",
             label.size = NA, size = 6)
fig5
finalise_plot(plot_name = fig5,
              source = "The Making of a (Dog) Movie Star",
              save_filepath = " ",
              width_pixels = 640, height_pixels = 450)