Commit 6332a49a authored by Simon Pintarelli's avatar Simon Pintarelli

update statistics.Rnw, add some wrapper scripts

parent 24bca575
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract the columns 'Legi-Number', 'Last Name', 'First Name' from the file
Grades*txt (the one exported from edoz). Store result as xls.
"""
# to_excel needs python-xlwt
import pandas as pa
import argparse
import os
# Columns copied from the edoz export into the marks sheet, in output order.
selected_columns = ['Number', 'Last Name', 'First Name']

# Column headers of the German edoz export, in file order.
header_de = [
    'Id',
    'Familienname',
    'Vorname',
    'Nummer',
    'Sekretariat',
    'Fachrichtung',
    'Rep.',
    'Note',
    '+ / - (Notentendenz)',
    '* (Abbruch/nicht erschienen)',
    'Prüfsumme',
]

# Column headers of the English edoz export; entry i corresponds to
# header_de[i].
header_en = [
    'Id',
    'Last Name',
    'First Name',
    'Number',
    'Administration Office',
    'Direction',
    'Rep.',
    'Grade',
    '+ / - (Grade Trend)',
    '* (dropout/no show)',
    'Checksum',
]

# German header -> English header, paired position by position.
translation = dict(zip(header_de, header_en))
def extract_columns(args):
    """
    Build the marks sheet from an edoz export.

    Reads the tab-separated, ISO-8859-1 encoded file ``args.src``. Both the
    German and the English edoz header are accepted; German column names are
    translated to English. The columns listed in ``selected_columns`` are
    kept, four empty columns 'A1'..'A4' are appended, and the result is
    written to ``marks.xls`` (and to ``marks.csv`` when ``args.to_csv`` is
    set), both in the current working directory.

    Parameters
    ----------
    args : namespace with the attributes
        src    -- path of the exported edoz file
        force  -- overwrite existing output files when true
        to_csv -- additionally write ``marks.csv`` when true

    Raises
    ------
    Exception
        If the input cannot be read, if neither header variant is
        recognised, or if an output file already exists and ``args.force``
        is not set.
    KeyError
        If a recognised header variant is missing one of its columns.
    """
    try:
        edoz_data = pa.read_csv(args.src, sep='\t', encoding='ISO-8859-1')
    except Exception as err:
        # Keep the original cause attached so the real problem stays visible.
        raise Exception(
            'Could not read from csv-file ' + str(args.src) +
            '\nCheck that file exists and is ISO-8859-1 encoded') from err

    # There is a tab past the last column, so pandas reads an additional,
    # empty (all-NaN) column. Selecting the known headers drops it again.
    if 'Familienname' in edoz_data.columns:
        edoz_data = edoz_data[header_de]
        # Translate the German headers to English.
        edoz_data.columns = [translation[c] for c in edoz_data.columns]
    elif 'Last Name' in edoz_data.columns:
        edoz_data = edoz_data[header_en]
    else:
        raise Exception(
            'Something with the column-headers is wrong (expected to find Last Name). Check your input file %s.'
            % args.src)

    # Work on a copy: adding columns to a slice of edoz_data would trigger
    # pandas' SettingWithCopyWarning.
    grade_columns = edoz_data[selected_columns].copy()
    for i in range(1, 5):
        grade_columns['A%d' % i] = ''

    # Check every target before writing anything, so that a blocked csv does
    # not leave a freshly written xls behind that blocks the next attempt.
    targets = ['marks.xls']
    if args.to_csv:
        targets.append('marks.csv')
    if not args.force:
        for target in targets:
            if os.path.exists(target):
                raise Exception(
                    './%s exists. Delete or rename it and try again.' % target)

    # No ``encoding`` keyword here: pandas documents it as unused by
    # to_excel and removed it in 2.0.
    # NOTE(review): writing '.xls' needs an xls-capable writer (the file
    # mentions python-xlwt) - confirm it is available in the target setup.
    grade_columns.to_excel('marks.xls', index=False)
    if args.to_csv:
        grade_columns.to_csv(
            'marks.csv', sep=',', encoding='ISO-8859-1', index=False)
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to
    # extract_columns unchanged.
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument('src', help='exported edoz file (Grades|Noten)*txt')
    cli.add_argument(
        '-f', '--force',
        action='store_true',
        help='overwrite *csv,xls without asking')
    cli.add_argument(
        '--to-csv',
        action='store_true',
        help='Also save as csv (ISO-8859-1)')
    args = cli.parse_args()
    extract_columns(args)
# coding: utf-8
import pandas as pa
# Column layout written to edoz-final.txt, in output order.
edoz_columns = [
    'Id', 'Last Name', 'First Name', 'Number', 'Administration Office',
    'Direction', 'Rep.', 'Grade', '+ / - (Grade Trend)',
    '* (dropout/no show)', 'Checksum',
]

# Inputs: the tab-separated template and the comma-separated grades file.
data = pa.read_csv('grades-template.txt', sep='\t', encoding='ISO-8859-1')
grades = pa.read_csv('grades-out.csv', sep=',', encoding='ISO-8859-1')

# Join the computed grades onto the template by 'Number'. The merged frame is
# expected to carry the grade from grades-out.csv as 'Grade_y' (the suffix
# pandas gives the right-hand column when both sides have a 'Grade' column).
tmp = grades.loc[:, ['Number', 'Grade']]
out = data.merge(tmp, on='Number')

# Pick the merged grade instead of the template's one, then restore the
# plain column names.
out = out[['Grade_y' if name == 'Grade' else name for name in edoz_columns]]
out.columns = edoz_columns

out.to_csv('edoz-final.txt', sep='\t', encoding='ISO-8859-1', index=False)

# Rows whose grade is exactly 3.75; only bound to a name here, not written
# anywhere in this script.
knapp = out.loc[out['Grade'].eq(3.75)]
% Author: Simon Pintarelli <simon.pintarelli@sam.math.ethz.ch>
% -------------------------------------------------------------
% Input: marks.csv
% (with columns: 'First Name', 'Last Name', 'Number', 'A1', ...'An')
% -------------------------------------------------------------
% Output:
% - tex-file to generate pdf
% - 'grades-out.txt'
% -------------------------------------------------------------
% Emacs hint: use M-x ess-noweb-set-code-mode, then type R-mode
% Author: Simon Pintarelli <simon.pintarelli@sam.math.ethz.ch>
% -------------------------------------------------------------
\documentclass[a4paper]{article}
\usepackage{subfig}
......@@ -44,7 +53,7 @@ int_grades <- c(1, 4, 6, 6)
# from http://stackoverflow.com/questions/11030898/knitr-how-to-align-code-and-plot-side-by-side
partWidth <- 45
fullWidth <- 80
options(width = fullWidth)
options(width=fullWidth)
## option() settings, just for the current chunk
knit_hooks$set(r.opts=local({
......@@ -65,9 +74,7 @@ knit_hooks$set(r.opts=local({
## case, wrap the usual textual output in LaTeX code placing it in a
## narrower adjustbox environment and setting the graphics that it
## produced in another box beside it.
defaultChunkHook <- environment(knit_hooks[["get"]])$defaults$chunk
codefigChunkHook <- function (x, options) {
main <- defaultChunkHook(x, options)
before <-
......@@ -113,46 +120,77 @@ library(xtable)
@
<<prepare data, echo=FALSE>>=
dset <- read.csv('data.csv', encoding='ISO-8859-1')
dset <- dset[, !(names(dset) %in% c("Familienname", "Vorname"))]
dset <- melt(dset, id=c("Nummer"))
dset <- read.csv('marks.csv', encoding='ISO-8859-1')
dset <- dset[, !(names(dset) %in% c("Last.Name", "First.Name"))]
dset <- melt(dset, id=c("Number"))
dset$value <- as.numeric(dset$value)
total_points <- group_by(dset, Nummer) %>% summarize(s=sum(value))
total_points <- group_by(dset, Number) %>% summarize(s=sum(value))
linscale <- data.frame(approx(int_points, int_grades, seq(0, maxpoints, by=dp)))
scale <- data.frame(grade=floor(linscale$y*1/dg)*dg, s=linscale$x)
grades <- data.frame(left_join(x=total_points, y=scale, by="s"))
colnames(grades) <- c("legi", "points", "grade")
grades$legi <- as.factor(grades$legi)
# plot grading scale (linear interpolation and projection to existing marks)
## plot grading scale (linear interpolation and projection to existing marks)
scale_plot <- cbind(linscale, col=rep("linear interpolation", nrow(linscale)))
colnames(scale_plot) <- c("s", "grade", "col")
scale_plot <- rbind(scale_plot, cbind(scale, col=rep("rounded", nrow(linscale))))
@
<<store grades, echo=FALSE>>=
## export (legi, grade) to csv
write.csv(data.frame(Number=grades$legi, Grade=grades$grade),
fileEncoding='ISO-8859-1',
file="grades-out.csv",
row.names=FALSE,
quote=FALSE, na="")
@
\section*{Grades}
\label{sec:grades}
<<grades, echo=FALSE, warning=TRUE, fig.width=5, fig.height=5, out.width="0.49\\linewidth", fig.show='hold', fig.env="figure*", fig.subcap=c("Histogram", "CDF"), fig.pos='h!'>>=
ggplot(grades, aes(x=grade)) + geom_histogram(binwidth = 0.25, alpha=0.8) + scale_x_continuous(breaks=seq(1,6, by=1)) + ylab("Num. Stud")
ggplot(grades, aes(x=grade)) + stat_ecdf() + scale_x_continuous(breaks=seq(1,6, by=1)) + ylab("Percentage")
ggplot(grades, aes(x=grade)) +
geom_histogram(binwidth = 0.25, alpha=0.8) +
scale_x_continuous(breaks=seq(1,6, by=1)) +
ylab("Num. Stud")
ggplot(grades, aes(x=grade)) +
stat_ecdf() +
scale_x_continuous(breaks=seq(1,6, by=1)) +
ylab("Percentage")
@ %def
<<violing, echo=FALSE, fig.height=1, fig.width=3, fig.show='hold'>>=
ogrades <- cbind.data.frame(x=rep("grade", nrow(grades)), y=grades$grade)
opoints <- cbind.data.frame(x=rep("points", nrow(grades)), y=grades$points)
ggplot(ogrades, aes(x=x,y=y)) + geom_violin() + geom_boxplot(width=.1, fill="black", outlier.colour=NA) +
stat_summary(fun.y=median, geom="point", fill="white", shape=21, size=2.5) + coord_flip() + ylab("") + xlab("")
ggplot(opoints, aes(x=x,y=y)) + geom_violin() + geom_boxplot(width=.1, fill="black", outlier.colour=NA) +
stat_summary(fun.y=median, geom="point", fill="white", shape=21, size=2.5) + coord_flip() + ylab("") + xlab("")
ggplot(ogrades, aes(x=x,y=y)) +
geom_violin() +
geom_boxplot(width=.1, fill="black", outlier.colour=NA) +
stat_summary(fun.y=median, geom="point", fill="white", shape=21, size=2.5) +
coord_flip() +
xlab("") +
ylab("")
ggplot(opoints, aes(x=x,y=y)) +
geom_violin() +
geom_boxplot(width=.1, fill="black", outlier.colour=NA) +
stat_summary(fun.y=median, geom="point", fill="white", shape=21, size=2.5) +
coord_flip() +
xlab("") +
ylab("")
@
<<grading_scale_plot, opts.label="codefig", echo=FALSE, results='asis', cache=FALSE, fig.width=5, fig.height=5>>=
I <- match(seq(1, 6, by=dg), scale$grade)
plot1 <- ggplot(scale_plot, aes(x=s, y=grade, col=col)) + geom_line() + scale_y_continuous(breaks=seq(1,6,by=0.5)) +
scale_x_continuous(breaks=seq(0, maxpoints, by=ceiling(maxpoints/10))) + xlab("points") + theme(legend.position="bottom")
plot1 <- ggplot(scale_plot, aes(x=s, y=grade, col=col)) +
geom_line() +
scale_y_continuous(breaks=seq(1,6,by=0.5)) +
scale_x_continuous(breaks=seq(0, maxpoints, by=ceiling(maxpoints/10))) +
xlab("points") +
theme(legend.position="bottom")
plot1 + theme(legend.title=element_blank())
print(xtable(data.frame(Grade=scale$grade[I], Points=scale$s[I])), include.rownames=FALSE, floating=FALSE)
@
......@@ -172,14 +210,29 @@ ppassed <- round(100*passed/ncand, 1)
<<out,echo=FALSE,warning=FALSE, fig.width=5, fig.height=5, out.width="0.49\\linewidth",fig.cap="Point distribution (per task)",fig.show='hold',fig.subcap=c("Histogram","CDF"), fig.env="figure*", fig.pos='h!'>>=
ggplot(dset, aes(x=value)) + geom_histogram(binwidth=1, alpha=0.8) + facet_wrap(~variable, scales="free") + ylab("Num. Stud.") + xlab("Points")
ggplot(dset, aes(x=value)) + stat_ecdf() + facet_wrap(~variable, scales="free") + xlab("Points") + ylab("Percentage")
ggplot(dset, aes(x=value)) +
geom_histogram(binwidth=1, alpha=0.8) +
facet_wrap(~variable, scales="free") +
ylab("Num. Stud.") +
xlab("Points")
ggplot(dset, aes(x=value)) +
stat_ecdf() +
facet_wrap(~variable, scales="free") +
xlab("Points") +
ylab("Percentage")
@ %def
<<total, echo=FALSE, warning=FALSE, fig.width=5, fig.height=5, out.width="0.49\\linewidth", fig.cap="Point distribution (total)", fig.show='hold', fig.subcap=c("Histogram", "CDF"), fig.env='figure*', fig.pos='h!'>>=
ggplot(total_points, aes(x=s)) + geom_histogram(binwidth=2, alpha=0.8) + ylab("Num. stud.") + xlab("Total points") + xlim(0, maxpoints)
ggplot(total_points, aes(x=s)) + stat_ecdf() + xlim(0, maxpoints) + xlab("Total points")
ggplot(total_points, aes(x=s)) +
geom_histogram(binwidth=2, alpha=0.8) +
ylab("Num. stud.") +
xlab("Total points") +
xlim(0, maxpoints)
ggplot(total_points, aes(x=s)) +
stat_ecdf() +
xlim(0, maxpoints) +
xlab("Total points")
@ %def
<<density, echo=FALSE, warning=FALSE, out.width="0.49\\linewidth", fig.cap="Total", fig.show='hold', fig.pos='h!', fig.height=3>>=
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Convert xls to csv in ISO-8859-1 encoding. Unfortunately python pandas does not
support odt.
"""
import pandas as pa
import sys
import argparse
import os
if __name__ == '__main__':
    # Command-line entry point: read the spreadsheet given as `src` and write
    # it to `dst` as comma-separated, ISO-8859-1 encoded text.
    cli = argparse.ArgumentParser(description=__doc__)
    for arg_name, arg_help in (('src', 'path/to/xls'), ('dst', 'output')):
        cli.add_argument(arg_name, help=arg_help)
    args = cli.parse_args()

    if os.path.exists(args.src):
        data = pa.read_excel(args.src)
        data.to_csv(args.dst, index=False, encoding='ISO-8859-1', sep=',')
    else:
        # Fail before touching the output when the input is missing.
        raise Exception('Could not read from file ' + str(args.src))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment