diff --git a/notebooks/code_checklist-analysis.Rmd b/notebooks/code_checklist-analysis.Rmd index 13b2f4d..a3eac9e 100644 --- a/notebooks/code_checklist-analysis.Rmd +++ b/notebooks/code_checklist-analysis.Rmd @@ -1,11 +1,13 @@ --- -title: "R Notebook" -output: html_notebook +title: "ML Code Completeness Checklist Analysis" +output: + pdf_document: default + html_notebook: default --- -ML Code Completeness analysis for NeurIPS 2019 repositories. +This notebook contains the ML Code Completeness analysis for NeurIPS 2019 repositories. -For a run & rendered version of this notebook please see: [code_checklist-analysis.nb.html](code_checklist-analysis.nb.html). +For a run & rendered version of this notebook please see: [code_checklist-analysis.pdf](code_checklist-analysis.pdf). Official repositories for NeurIPS 2019 papers fetched from: https://papers.nips.cc/book/advances-in-neural-information-processing-systems-32-2019 @@ -85,7 +87,7 @@ Fit robust regression and test significance of results print(summary(rlm(stars~training+evaluation+pretrained_model+results+dependencies, data=t))) for(i in 0:4){ - cat("Score5 vs Score", i) + cat("\nScore5 vs Score", i, "\n") print(wilcox.test(t$stars[t$score==5], t$stars[t$score==i])) } ``` diff --git a/notebooks/code_checklist-analysis.nb.html b/notebooks/code_checklist-analysis.nb.html deleted file mode 100644 index c906795..0000000 --- a/notebooks/code_checklist-analysis.nb.html +++ /dev/null @@ -1,2133 +0,0 @@ - - - - - - - - - - - - - -R Notebook - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - -

ML Code Completeness analysis for NeurIPS 2019 repositories.

-

For a run & rendered version of this notebook please see: code_checklist-analysis.nb.html.

-

Official repositories for NeurIPS 2019 papers fetched from: https://papers.nips.cc/book/advances-in-neural-information-processing-systems-32-2019

-

A random 25% sample has been selected and manually annotated according to the 5 criteria of the ML Code Completeness Checklist. The result has been saved into code_checklist-neurips2019.csv.

- - - -
library(tidyverse)
- - -
Registered S3 method overwritten by 'dplyr':
-  method           from
-  print.rowwise_df     
-Registered S3 methods overwritten by 'dbplyr':
-  method         from
-  print.tbl_lazy     
-  print.tbl_sql      
-── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
-✓ ggplot2 3.2.1     ✓ purrr   0.3.3
-✓ tibble  2.1.3     ✓ dplyr   0.8.4
-✓ tidyr   1.0.2     ✓ stringr 1.4.0
-✓ readr   1.3.1     ✓ forcats 0.4.0
-── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
-x dplyr::filter() masks stats::filter()
-x dplyr::lag()    masks stats::lag()
- - -
library(ggplot2)
-library(MASS)
- - -

-Attaching package: ‘MASS’
-
-The following object is masked from ‘package:dplyr’:
-
-    select
- - -
library(RColorBrewer)
-
-t = read_csv("code_checklist-neurips2019.csv")
- - -
Parsed with column specification:
-cols(
-  url = col_character(),
-  stars = col_double(),
-  python = col_double(),
-  training = col_double(),
-  evaluation = col_double(),
-  pretrained_model = col_double(),
-  results = col_double(),
-  dependencies = col_double()
-)
- - -
cat("Number of rows:", nrow(t), "\n")
- - -
Number of rows: 221 
- - - -

We’ll focus only on Python repositories, since this is the dominant language in ML and repositories in other languages tend to have a smaller number of stars just because the community is smaller.

- - - -
t = t[t$python==1,]
-cat("Number of rows:", nrow(t), "\n")
- - -
Number of rows: 200 
- - - -

Next, we calculate the score as the sum of the individual checklist items and calculate summary stats.

- - - -
t$score = rowSums(t[,4:8])
- - - -

We group repositories based on their score and calculate summary stats.

- - - -

-cat("Spread of values in each group:\n")
- - -
Spread of values in each group:
- - -
summaries = tapply(t$stars, t$score, summary)
-names(summaries) = paste(names(summaries), "ticks")
-print(summaries)
- - -
$`0 ticks`
-   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-    0.0     0.0     1.5    14.5    10.0    89.0 
-
-$`1 ticks`
-   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-   0.00    0.00    5.00   11.94   11.00   59.00 
-
-$`2 ticks`
-   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-   0.00    4.00   15.00   43.17   30.00  654.00 
-
-$`3 ticks`
-   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-   0.00    6.00   19.00  171.15   75.75 6082.00 
-
-$`4 ticks`
-   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-   1.00   22.25   62.50  457.88  148.50 5114.00 
-
-$`5 ticks`
-    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-   16.00    93.25   196.50  2664.89   517.00 36549.00 
- - -
cat("Proportion of repos in each group:\n")
- - -
Proportion of repos in each group:
- - -
props = tapply(t$stars, t$score, length)
-props = props/sum(props)
-names(props) = paste(names(props), "ticks")
-print(props)
- - -
0 ticks 1 ticks 2 ticks 3 ticks 4 ticks 5 ticks 
-  0.050   0.085   0.205   0.360   0.210   0.090 
- - -
# Extract medians
-medians = unlist(lapply(tapply(t$stars, t$score, summary), function(x) x["Median"]))
-names(medians) = paste(sub(".Median", "", names(medians)), "ticks")
- - - -

Generate summary graphs.

- - - -
par(oma=c(0,1,0,1))
-layout(matrix(c(1,2), 1, 2, byrow = TRUE), widths=c(3,2))
-barplot(medians, 
-        xlab="", 
-        ylab="Median GitHub stars", ylim=c(0,200),
-        col=brewer.pal(6, "Blues"), cex.axis=0.7, cex.names=0.7)
-mtext("GitHub repos grouped by number of ticks on ML code checklist", side=1, line=3, cex=0.8)
- - -

-pie(rev(props), col=rev(brewer.pal(6, "Blues")), cex=0.7)
-mtext("Proportion of repositories in each group", side=1, line=3, cex=0.8)
- - -

- - - -

Compare using box plots.

- - - -
tp = t
-tp$score = as.factor(tp$score)
-par(mfrow=c(1,1))
-boxplot(stars~score, data=t, ylim=c(0,200), col=brewer.pal(6, "Blues"),
-        xlab="ML code checklist ticks", ylab="Github stars")
- - -

- - - -

Fit a robust regression and test the significance of the results.

- - - -
print(summary(rlm(stars~training+evaluation+pretrained_model+results+dependencies, data=t)))
- - -

-Call: rlm(formula = stars ~ training + evaluation + pretrained_model + 
-    results + dependencies, data = t)
-Residuals:
-      Min        1Q    Median        3Q       Max 
- -118.293   -25.391    -7.406    36.218 36414.707 
-
-Coefficients:
-                 Value    Std. Error t value 
-(Intercept)       -1.0246  11.5557    -0.0887
-training          24.3908  11.8245     2.0627
-evaluation       -12.0504   8.8434    -1.3626
-pretrained_model  70.3466   9.1685     7.6726
-results           36.7966   8.8318     4.1664
-dependencies      15.8344   9.2208     1.7172
-
-Residual standard error: 40.03 on 194 degrees of freedom
- - -
for(i in 0:4){
-  cat("Score5 vs Score", i)
-  print(wilcox.test(t$stars[t$score==5], t$stars[t$score==i]))
-}
- - -
Score5 vs Score 0
- - -
cannot compute exact p-value with ties
- - -

-    Wilcoxon rank sum test with continuity correction
-
-data:  t$stars[t$score == 5] and t$stars[t$score == i]
-W = 174, p-value = 5.943e-05
-alternative hypothesis: true location shift is not equal to 0
-
-Score5 vs Score 1
- - -
cannot compute exact p-value with ties
- - -

-    Wilcoxon rank sum test with continuity correction
-
-data:  t$stars[t$score == 5] and t$stars[t$score == i]
-W = 300, p-value = 1.279e-06
-alternative hypothesis: true location shift is not equal to 0
-
-Score5 vs Score 2
- - -
cannot compute exact p-value with ties
- - -

-    Wilcoxon rank sum test with continuity correction
-
-data:  t$stars[t$score == 5] and t$stars[t$score == i]
-W = 677, p-value = 4.1e-07
-alternative hypothesis: true location shift is not equal to 0
-
-Score5 vs Score 3
-    Wilcoxon rank sum test with continuity correction
-
-data:  t$stars[t$score == 5] and t$stars[t$score == i]
-W = 1082, p-value = 1.22e-05
-alternative hypothesis: true location shift is not equal to 0
-
-Score5 vs Score 4
- - -
cannot compute exact p-value with ties
- - -

-    Wilcoxon rank sum test with continuity correction
-
-data:  t$stars[t$score == 5] and t$stars[t$score == i]
-W = 528.5, p-value = 0.01551
-alternative hypothesis: true location shift is not equal to 0
- - - -
-

Session information

- - - -
sessionInfo()
- - -
R version 3.6.2 (2019-12-12)
-Platform: x86_64-apple-darwin15.6.0 (64-bit)
-Running under: macOS Catalina 10.15.3
-
-Matrix products: default
-BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
-LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
-
-locale:
-[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
-
-attached base packages:
-[1] stats     graphics  grDevices utils     datasets  methods   base     
-
-other attached packages:
- [1] RColorBrewer_1.1-2 MASS_7.3-51.4      forcats_0.4.0      stringr_1.4.0      dplyr_0.8.4       
- [6] purrr_0.3.3        readr_1.3.1        tidyr_1.0.2        tibble_2.1.3       ggplot2_3.2.1     
-[11] tidyverse_1.3.0   
-
-loaded via a namespace (and not attached):
- [1] Rcpp_1.0.3       cellranger_1.1.0 pillar_1.4.3     compiler_3.6.2   dbplyr_1.4.2     tools_3.6.2     
- [7] lubridate_1.7.4  jsonlite_1.6.1   lifecycle_0.1.0  nlme_3.1-142     gtable_0.3.0     lattice_0.20-38 
-[13] pkgconfig_2.0.3  rlang_0.4.4      reprex_0.3.0     cli_2.0.1        DBI_1.1.0        rstudioapi_0.11 
-[19] haven_2.2.0      xfun_0.12        withr_2.1.2      xml2_1.2.2       httr_1.4.1       knitr_1.28      
-[25] fs_1.3.1         generics_0.0.2   vctrs_0.2.3      hms_0.5.3        grid_3.6.2       tidyselect_1.0.0
-[31] glue_1.3.1       R6_2.4.1         fansi_0.4.1      readxl_1.3.1     modelr_0.1.6     magrittr_1.5    
-[37] backports_1.1.5  scales_1.1.0     rvest_0.3.5      assertthat_0.2.1 colorspace_1.4-1 stringi_1.4.6   
-[43] lazyeval_0.2.2   munsell_0.5.0    broom_0.5.4      crayon_1.3.4    
- - -
- -
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKTUwgQ29kZSBDb21wbGV0ZW5lc3MgYW5hbHlzaXMgZm9yIE5ldXJJUFMgMjAxOSByZXBvc2l0b3JpZXMuIAoKRm9yIGEgcnVuICYgcmVuZGVyZWQgdmVyc2lvbiBvZiB0aGlzIG5vdGVib29rIHBsZWFzZSBzZWU6IFtjb2RlX2NoZWNrbGlzdC1hbmFseXNpcy5uYi5odG1sXShjb2RlX2NoZWNrbGlzdC1hbmFseXNpcy5uYi5odG1sKS4KCk9mZmljaWFsIHJlcG9zaXRvcmllcyBmb3IgTmV1cklQUyAyMDE5IHBhcGVycyBmZXRjaGVkIGZyb206IGh0dHBzOi8vcGFwZXJzLm5pcHMuY2MvYm9vay9hZHZhbmNlcy1pbi1uZXVyYWwtaW5mb3JtYXRpb24tcHJvY2Vzc2luZy1zeXN0ZW1zLTMyLTIwMTkKCkEgcmFuZG9tIDI1JSBzYW1wbGUgaGFzIGJlZW4gc2VsZWN0ZWQgYW5kIG1hbnVhbGx5IGFubm90YXRlZCBhY2NvcmRpbmcgdG8gdGhlIDUgY3JpdGVyYSBvZiB0aGUgTUwgQ29kZSBDb21wbGV0bmVzcyBDaGVja2xpc3QuIFRoZSByZXN1bHQgaGFzIGJlZW4gc2F2ZWQgaW50byBgY29kZV9jaGVja2xpc3QtbmV1cmlwczIwMTkuY3N2YC4KCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KE1BU1MpCmxpYnJhcnkoUkNvbG9yQnJld2VyKQoKdCA9IHJlYWRfY3N2KCJjb2RlX2NoZWNrbGlzdC1uZXVyaXBzMjAxOS5jc3YiKQpjYXQoIk51bWJlciBvZiByb3dzOiIsIG5yb3codCksICJcbiIpCmBgYAoKV2UnbGwgZm9jdXMgb25seSBvbiBQeXRob24gcmVwb3NpdG9yaWVzLCBzaW5jZSB0aGlzIGlzIHRoZSBkb21pbmFudCBsYW5ndWFnZSBpbiBNTCBhbmQgcmVwb3NpdG9yaWVzIGluIG90aGVyIGxhbmd1YWdlcyB0ZW5kIHRvIGhhdmUgYSBzbWFsbGVyIG51bWJlciBvZiBzdGFycyBqdXN0IGJlY2F1c2UgdGhlIGNvbW11bml0eSBpcyBzbWFsbGVyLiAKCmBgYHtyfQp0ID0gdFt0JHB5dGhvbj09MSxdCmNhdCgiTnVtYmVyIG9mIHJvd3M6IiwgbnJvdyh0KSwgIlxuIikKYGBgCgpOZXh0LCB3ZSBjYWxjdWxhdGUgdGhlIHNjb3JlIGFzIGEgc3VtIG9mIG9mIGluZGl2aWR1YWwgY2hlY2tsaXN0IGl0ZW1zIGFuZCBjYWxjdWxhdGUgc3VtbWFyeSBzdGF0cy4gCgpgYGB7cn0KdCRzY29yZSA9IHJvd1N1bXModFssNDo4XSkKYGBgCgpXZSBncm91cCByZXBvc2l0b3JpZXMgYmFzZWQgb24gdGhlaXIgc2NvcmUgYW5kIGNhbGN1bGF0ZSBzdW1tYXJ5IHN0YXRzLiAKCmBgYHtyfQoKY2F0KCJTcHJlYWQgb2YgdmFsdWVzIGluIGVhY2ggZ3JvdXA6XG4iKQpzdW1tYXJpZXMgPSB0YXBwbHkodCRzdGFycywgdCRzY29yZSwgc3VtbWFyeSkKbmFtZXMoc3VtbWFyaWVzKSA9IHBhc3RlKG5hbWVzKHN1bW1hcmllcyksICJ0aWNrcyIpCnByaW50KHN1bW1hcmllcykKCmNhdCgiUHJvcG9ydGlvbiBvZiByZXBvcyBpbiBlYWNoIGdyb3VwOlxuIikKcHJvcHMgPSB0YXBwbHkodCRzdGFycywgdCRzY29yZSwgbGVuZ3RoKQpwcm9wcyA9IHByb3BzL3N1
bShwcm9wcykKbmFtZXMocHJvcHMpID0gcGFzdGUobmFtZXMocHJvcHMpLCAidGlja3MiKQpwcmludChwcm9wcykKCiMgRXh0cmFjdCBtZWRpYW5zCm1lZGlhbnMgPSB1bmxpc3QobGFwcGx5KHRhcHBseSh0JHN0YXJzLCB0JHNjb3JlLCBzdW1tYXJ5KSwgZnVuY3Rpb24oeCkgeFsiTWVkaWFuIl0pKQpuYW1lcyhtZWRpYW5zKSA9IHBhc3RlKHN1YigiLk1lZGlhbiIsICIiLCBuYW1lcyhtZWRpYW5zKSksICJ0aWNrcyIpCmBgYAoKR2VuZXJhdGUgc3VtbWFyeSBncmFwaHMuIAoKYGBge3J9CnBhcihvbWE9YygwLDEsMCwxKSkKbGF5b3V0KG1hdHJpeChjKDEsMiksIDEsIDIsIGJ5cm93ID0gVFJVRSksIHdpZHRocz1jKDMsMikpCmJhcnBsb3QobWVkaWFucywgCiAgICAgICAgeGxhYj0iIiwgCiAgICAgICAgeWxhYj0iTWVkaWFuIEdpdEh1YiBzdGFycyIsIHlsaW09YygwLDIwMCksCiAgICAgICAgY29sPWJyZXdlci5wYWwoNiwgIkJsdWVzIiksIGNleC5heGlzPTAuNywgY2V4Lm5hbWVzPTAuNykKbXRleHQoIkdpdEh1YiByZXBvcyBncm91cGVkIGJ5IG51bWJlciBvZiB0aWNrcyBvbiBNTCBjb2RlIGNoZWNrbGlzdCIsIHNpZGU9MSwgbGluZT0zLCBjZXg9MC44KQoKcGllKHJldihwcm9wcyksIGNvbD1yZXYoYnJld2VyLnBhbCg2LCAiQmx1ZXMiKSksIGNleD0wLjcpCm10ZXh0KCJQcm9wb3J0aW9uIG9mIHJlcG9zaXRvcmllcyBpbiBlYWNoIGdyb3VwIiwgc2lkZT0xLCBsaW5lPTMsIGNleD0wLjgpCmBgYAoKQ29tcGFyZSB1c2luZyBib3ggcGxvdHMuIAoKYGBge3J9CnRwID0gdAp0cCRzY29yZSA9IGFzLmZhY3Rvcih0cCRzY29yZSkKcGFyKG1mcm93PWMoMSwxKSkKYm94cGxvdChzdGFyc35zY29yZSwgZGF0YT10LCB5bGltPWMoMCwyMDApLCBjb2w9YnJld2VyLnBhbCg2LCAiQmx1ZXMiKSwKICAgICAgICB4bGFiPSJNTCBjb2RlIGNoZWNrbGlzdCB0aWNrcyIsIHlsYWI9IkdpdGh1YiBzdGFycyIpCmBgYAoKRml0IHJvYnVzdCByZWdyZXNzaW9uIGFuZCB0ZXN0IHNpZ25pZmljYW5jZSBvZiByZXN1bHRzCgpgYGB7cn0KcHJpbnQoc3VtbWFyeShybG0oc3RhcnN+dHJhaW5pbmcrZXZhbHVhdGlvbitwcmV0cmFpbmVkX21vZGVsK3Jlc3VsdHMrZGVwZW5kZW5jaWVzLCBkYXRhPXQpKSkKCmZvcihpIGluIDA6NCl7CiAgY2F0KCJTY29yZTUgdnMgU2NvcmUiLCBpKQogIHByaW50KHdpbGNveC50ZXN0KHQkc3RhcnNbdCRzY29yZT09NV0sIHQkc3RhcnNbdCRzY29yZT09aV0pKQp9CmBgYAoKIyMjIFNlc3Npb24gaW5mb3JtYXRpb24KCmBgYHtyfQpzZXNzaW9uSW5mbygpCmBgYAo=
- - - -
- - - - - - - - - - - - - - - - diff --git a/notebooks/code_checklist-analysis.pdf b/notebooks/code_checklist-analysis.pdf new file mode 100644 index 0000000..01c95d1 Binary files /dev/null and b/notebooks/code_checklist-analysis.pdf differ