diff --git a/notebooks/code_checklist-analysis.Rmd b/notebooks/code_checklist-analysis.Rmd index 13b2f4d..a3eac9e 100644 --- a/notebooks/code_checklist-analysis.Rmd +++ b/notebooks/code_checklist-analysis.Rmd @@ -1,11 +1,13 @@ --- -title: "R Notebook" -output: html_notebook +title: "ML Code Completeness Checklist Analysis" +output: + pdf_document: default + html_notebook: default --- -ML Code Completeness analysis for NeurIPS 2019 repositories. +This notebook contains the ML Code Completeness analysis for NeurIPS 2019 repositories. -For a run & rendered version of this notebook please see: [code_checklist-analysis.nb.html](code_checklist-analysis.nb.html). +For a run & rendered version of this notebook please see: [code_checklist-analysis.pdf](code_checklist-analysis.pdf). Official repositories for NeurIPS 2019 papers fetched from: https://papers.nips.cc/book/advances-in-neural-information-processing-systems-32-2019 @@ -85,7 +87,7 @@ Fit robust regression and test significance of results print(summary(rlm(stars~training+evaluation+pretrained_model+results+dependencies, data=t))) for(i in 0:4){ - cat("Score5 vs Score", i) + cat("\nScore5 vs Score", i, "\n") print(wilcox.test(t$stars[t$score==5], t$stars[t$score==i])) } ``` diff --git a/notebooks/code_checklist-analysis.nb.html b/notebooks/code_checklist-analysis.nb.html deleted file mode 100644 index c906795..0000000 --- a/notebooks/code_checklist-analysis.nb.html +++ /dev/null @@ -1,2133 +0,0 @@ - - - - -
- - - - - - - - -ML Code Completeness analysis for NeurIPS 2019 repositories.
-For a run & rendered version of this notebook please see: code_checklist-analysis.nb.html.
-Official repositories for NeurIPS 2019 papers fetched from: https://papers.nips.cc/book/advances-in-neural-information-processing-systems-32-2019
-A random 25% sample has been selected and manually annotated according to the 5 critera of the ML Code Completness Checklist. The result has been saved into code_checklist-neurips2019.csv.
library(tidyverse)
-
-
-Registered S3 method overwritten by 'dplyr':
- method from
- print.rowwise_df
-Registered S3 methods overwritten by 'dbplyr':
- method from
- print.tbl_lazy
- print.tbl_sql
-[37m── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
-[37m[32m✓[37m [34mggplot2[37m 3.2.1 [32m✓[37m [34mpurrr [37m 0.3.3
-[32m✓[37m [34mtibble [37m 2.1.3 [32m✓[37m [34mdplyr [37m 0.8.4
-[32m✓[37m [34mtidyr [37m 1.0.2 [32m✓[37m [34mstringr[37m 1.4.0
-[32m✓[37m [34mreadr [37m 1.3.1 [32m✓[37m [34mforcats[37m 0.4.0[39m
-[37m── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
-[31mx[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
-[31mx[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()[39m
-
-
-library(ggplot2)
-library(MASS)
-
-
-
-Attaching package: ‘MASS’
-
-The following object is masked from ‘package:dplyr’:
-
- select
-
-
-library(RColorBrewer)
-
-t = read_csv("code_checklist-neurips2019.csv")
-
-
-Parsed with column specification:
-cols(
- url = [31mcol_character()[39m,
- stars = [32mcol_double()[39m,
- python = [32mcol_double()[39m,
- training = [32mcol_double()[39m,
- evaluation = [32mcol_double()[39m,
- pretrained_model = [32mcol_double()[39m,
- results = [32mcol_double()[39m,
- dependencies = [32mcol_double()[39m
-)
-
-
-cat("Number of rows:", nrow(t), "\n")
-
-
-Number of rows: 221
-
-
-
-We’ll focus only on Python repositories, since this is the dominant language in ML and repositories in other languages tend to have a smaller number of stars just because the community is smaller.
- - - -t = t[t$python==1,]
-cat("Number of rows:", nrow(t), "\n")
-
-
-Number of rows: 200
-
-
-
-Next, we calculate the score as a sum of of individual checklist items and calculate summary stats.
- - - -t$score = rowSums(t[,4:8])
-
-
-
-We group repositories based on their score and calculate summary stats.
- - - -
-cat("Spread of values in each group:\n")
-
-
-Spread of values in each group:
-
-
-summaries = tapply(t$stars, t$score, summary)
-names(summaries) = paste(names(summaries), "ticks")
-print(summaries)
-
-
-$`0 ticks`
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.0 0.0 1.5 14.5 10.0 89.0
-
-$`1 ticks`
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.00 0.00 5.00 11.94 11.00 59.00
-
-$`2 ticks`
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.00 4.00 15.00 43.17 30.00 654.00
-
-$`3 ticks`
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 0.00 6.00 19.00 171.15 75.75 6082.00
-
-$`4 ticks`
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 1.00 22.25 62.50 457.88 148.50 5114.00
-
-$`5 ticks`
- Min. 1st Qu. Median Mean 3rd Qu. Max.
- 16.00 93.25 196.50 2664.89 517.00 36549.00
-
-
-cat("Proportion of repos in each group:\n")
-
-
-Proportion of repos in each group:
-
-
-props = tapply(t$stars, t$score, length)
-props = props/sum(props)
-names(props) = paste(names(props), "ticks")
-print(props)
-
-
-0 ticks 1 ticks 2 ticks 3 ticks 4 ticks 5 ticks
- 0.050 0.085 0.205 0.360 0.210 0.090
-
-
-# Extract medians
-medians = unlist(lapply(tapply(t$stars, t$score, summary), function(x) x["Median"]))
-names(medians) = paste(sub(".Median", "", names(medians)), "ticks")
-
-
-
-Generate summary graphs.
- - - -par(oma=c(0,1,0,1))
-layout(matrix(c(1,2), 1, 2, byrow = TRUE), widths=c(3,2))
-barplot(medians,
- xlab="",
- ylab="Median GitHub stars", ylim=c(0,200),
- col=brewer.pal(6, "Blues"), cex.axis=0.7, cex.names=0.7)
-mtext("GitHub repos grouped by number of ticks on ML code checklist", side=1, line=3, cex=0.8)
-
-
-
-pie(rev(props), col=rev(brewer.pal(6, "Blues")), cex=0.7)
-mtext("Proportion of repositories in each group", side=1, line=3, cex=0.8)
-
-
-Compare using box plots.
- - - -tp = t
-tp$score = as.factor(tp$score)
-par(mfrow=c(1,1))
-boxplot(stars~score, data=t, ylim=c(0,200), col=brewer.pal(6, "Blues"),
- xlab="ML code checklist ticks", ylab="Github stars")
-
-
-Fit robust regression and test significance of results
- - - -print(summary(rlm(stars~training+evaluation+pretrained_model+results+dependencies, data=t)))
-
-
-
-Call: rlm(formula = stars ~ training + evaluation + pretrained_model +
- results + dependencies, data = t)
-Residuals:
- Min 1Q Median 3Q Max
- -118.293 -25.391 -7.406 36.218 36414.707
-
-Coefficients:
- Value Std. Error t value
-(Intercept) -1.0246 11.5557 -0.0887
-training 24.3908 11.8245 2.0627
-evaluation -12.0504 8.8434 -1.3626
-pretrained_model 70.3466 9.1685 7.6726
-results 36.7966 8.8318 4.1664
-dependencies 15.8344 9.2208 1.7172
-
-Residual standard error: 40.03 on 194 degrees of freedom
-
-
-for(i in 0:4){
- cat("Score5 vs Score", i)
- print(wilcox.test(t$stars[t$score==5], t$stars[t$score==i]))
-}
-
-
-Score5 vs Score 0
-
-
-cannot compute exact p-value with ties
-
-
-
- Wilcoxon rank sum test with continuity correction
-
-data: t$stars[t$score == 5] and t$stars[t$score == i]
-W = 174, p-value = 5.943e-05
-alternative hypothesis: true location shift is not equal to 0
-
-Score5 vs Score 1
-
-
-cannot compute exact p-value with ties
-
-
-
- Wilcoxon rank sum test with continuity correction
-
-data: t$stars[t$score == 5] and t$stars[t$score == i]
-W = 300, p-value = 1.279e-06
-alternative hypothesis: true location shift is not equal to 0
-
-Score5 vs Score 2
-
-
-cannot compute exact p-value with ties
-
-
-
- Wilcoxon rank sum test with continuity correction
-
-data: t$stars[t$score == 5] and t$stars[t$score == i]
-W = 677, p-value = 4.1e-07
-alternative hypothesis: true location shift is not equal to 0
-
-Score5 vs Score 3
- Wilcoxon rank sum test with continuity correction
-
-data: t$stars[t$score == 5] and t$stars[t$score == i]
-W = 1082, p-value = 1.22e-05
-alternative hypothesis: true location shift is not equal to 0
-
-Score5 vs Score 4
-
-
-cannot compute exact p-value with ties
-
-
-
- Wilcoxon rank sum test with continuity correction
-
-data: t$stars[t$score == 5] and t$stars[t$score == i]
-W = 528.5, p-value = 0.01551
-alternative hypothesis: true location shift is not equal to 0
-
-
-
-sessionInfo()
-
-
-R version 3.6.2 (2019-12-12)
-Platform: x86_64-apple-darwin15.6.0 (64-bit)
-Running under: macOS Catalina 10.15.3
-
-Matrix products: default
-BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
-LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
-
-locale:
-[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
-
-attached base packages:
-[1] stats graphics grDevices utils datasets methods base
-
-other attached packages:
- [1] RColorBrewer_1.1-2 MASS_7.3-51.4 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.4
- [6] purrr_0.3.3 readr_1.3.1 tidyr_1.0.2 tibble_2.1.3 ggplot2_3.2.1
-[11] tidyverse_1.3.0
-
-loaded via a namespace (and not attached):
- [1] Rcpp_1.0.3 cellranger_1.1.0 pillar_1.4.3 compiler_3.6.2 dbplyr_1.4.2 tools_3.6.2
- [7] lubridate_1.7.4 jsonlite_1.6.1 lifecycle_0.1.0 nlme_3.1-142 gtable_0.3.0 lattice_0.20-38
-[13] pkgconfig_2.0.3 rlang_0.4.4 reprex_0.3.0 cli_2.0.1 DBI_1.1.0 rstudioapi_0.11
-[19] haven_2.2.0 xfun_0.12 withr_2.1.2 xml2_1.2.2 httr_1.4.1 knitr_1.28
-[25] fs_1.3.1 generics_0.0.2 vctrs_0.2.3 hms_0.5.3 grid_3.6.2 tidyselect_1.0.0
-[31] glue_1.3.1 R6_2.4.1 fansi_0.4.1 readxl_1.3.1 modelr_0.1.6 magrittr_1.5
-[37] backports_1.1.5 scales_1.1.0 rvest_0.3.5 assertthat_0.2.1 colorspace_1.4-1 stringi_1.4.6
-[43] lazyeval_0.2.2 munsell_0.5.0 broom_0.5.4 crayon_1.3.4
-
-
-