Parcourir la source

Implement final version of the analyser

Lucas de Souza il y a 3 ans
Parent
commit
d472cddc6f
11 fichiers modifiés avec 790 ajouts et 29 suppressions
  1. 33 21
      analyser.py
  2. 139 0
      ccleaner.py
  3. 97 8
      cjson.py
  4. 11 0
      cminify.sh
  5. 60 0
      csvParser.py
  6. 14 0
      main.py
  7. 94 0
      pycleaner.py
  8. 107 0
      submissionAnalysis.py
  9. 98 0
      submissionFileReader.py
  10. 43 0
      threadPool.py
  11. 94 0
      vplAnalyzer.py

+ 33 - 21
analyser.py

@@ -19,6 +19,9 @@ VALID_OPS = list(['*', '-', '/', '+', '%', '++', '--', 'p--', 'p++'])
 VALID_OPS.extend(VALID_LOGIC_OPS)
 VALID_OPS.extend(VALID_REL_OPS)
 
+PRINT_COUNT = 0
+SCAN_COUNT = 0
+
 def normalizeType(type):
   if type in INT_TYPES:
     return 'int'
@@ -35,13 +38,13 @@ class DeclarationInfo:
 
   def isVector (self) :
     return self.dimensions == 1
-  
+
   def isMatrix (self):
     return self.dimensions == 2
-  
+
   def isMultiDimension (self):
     return self.dimensions > 2
-  
+
   def isArray (self):
     return self.dimensions > 0
 
@@ -52,7 +55,7 @@ class CommandInfo:
     self.numLogicOps = numLogicOps
     self.numRelOps = numRelOps
     self.opList = opList
-  
+
   def __str__(self):
     return "Type:%s LogicOpCount:%d RelOpCount:%d OpList:%s"%( self.condType, self.numLogicOps, self.numRelOps, self.opList)
 
@@ -61,7 +64,7 @@ class ForCommandInfo (CommandInfo):
     CommandInfo.__init__(self, cmdCount, condType, numLogicOps, numRelOps, opList)
     self.useAssignment = useAssignment
     self.useNext = useNext
-  
+
   def __str__(self):
     return "hasInit:%s hasNext:%s Type:%s LogicOpCount:%d RelOpCount:%d OpList:%s"%(self.useAssignment, self.useNext, self.condType, self.numLogicOps, self.numRelOps, self.opList)
 
@@ -87,10 +90,10 @@ class ASTAnalyser:
     self.declarationsPointers = dict()
     self.declarationsVectors = dict()
     self.declarationsMatrixes = dict()
-  
+
   def conditionCommandStr (self) :
     return [ s.__str__() for s in self.conditionCommandData]
-  
+
   def forCommandStr (self) :
     return [ s.__str__() for s in self.forCommandData]
 
@@ -139,7 +142,7 @@ class ASTAnalyser:
         self.declarations[type] += 1
       else:
         self.declarations[type] = 1
-  
+
   def proccessDecl (self, node):
     type = node.type
     dimensions = 0
@@ -160,7 +163,7 @@ class ASTAnalyser:
           self.constantInitCount[init.value] += 1
         else:
           self.constantInitCount[init.value] = 1
-  
+
   def proccessFuncDef (self, node):
     name = node.__class__.__name__
     self.incCmdCount(name)
@@ -169,24 +172,29 @@ class ASTAnalyser:
       self.proccessCommand(cmd)
 
  def proccessFuncCall (self, node):
    """Count a FuncCall AST node and the operators used in its arguments.

    Also tallies calls to printf/scanf in the module-level PRINT_COUNT /
    SCAN_COUNT counters (only direct calls by name are seen here).

    NOTE(review): PRINT_COUNT/SCAN_COUNT are shared, unsynchronized module
    globals — if this analyser ever runs on the project's ThreadPool the
    counts may race; confirm the analysis is single-threaded per module.
    """
    global PRINT_COUNT, SCAN_COUNT
    name = node.__class__.__name__
    if node.name.name == "printf":
      PRINT_COUNT = PRINT_COUNT + 1
    elif node.name.name == "scanf":
      SCAN_COUNT = SCAN_COUNT + 1
    self.incCmdCount(name)
    # (sic) "epxrs" is a typo for "exprs"; kept verbatim in this
    # documentation-only pass.
    epxrs = node.args.exprs
    for e in epxrs:
      self.countOperators(e)
-    
+
   def proccessAssignment (self, node):
     name = node.__class__.__name__
     self.incCmdCount(name)
     epxr = node.rvalue
     self.countOperators(epxr)
-  
+
   def proccessReturn (self, node):
     name = node.__class__.__name__
     self.incCmdCount(name)
     epxr = node.expr
     self.countOperators(epxr)
-  
+
   def proccessSwitch (self, node):
     name = node.__class__.__name__
     self.incCmdCount(name)
@@ -195,7 +203,7 @@ class ASTAnalyser:
     cmdList = node.stmt.block_items
     for cmd in cmdList:
       self.proccessCommand(cmd)
-  
+
   def proccessDoWhile (self, name, node):
     self.incCmdCount(name)
     epxr = node.cond
@@ -216,7 +224,7 @@ class ASTAnalyser:
     else:
       self.proccessCommand(cmdList)
     self.conditionCommandData.append(CommandInfo(self.cmdCountStack.pop(), condType,logicCount, relCount, opList))
-  
+
   def proccessFor (self, node):
     name = node.__class__.__name__
     self.incCmdCount(name)
@@ -233,7 +241,7 @@ class ASTAnalyser:
     hasInit = node.init.__class__.__name__ != 'NoneType'
     if hasInit:
       self.proccessCommand(node.init)
-    
+
     hasNext = node.next.__class__.__name__ != 'NoneType'
     if hasNext:
       self.proccessCommand(node.next)
@@ -249,7 +257,7 @@ class ASTAnalyser:
     elif name != 'NoneType':
       self.proccessCommand(cmdList)
     self.forCommandData.append(ForCommandInfo(hasInit, hasNext, self.cmdCountStack.pop(), condType, logicCount, relCount, opList))
-  
+
   def proccessIf (self, node):
     name = node.__class__.__name__
     self.incCmdCount(name)
@@ -271,17 +279,21 @@ class ASTAnalyser:
         self.proccessCommand(cmd)
     else:
       self.proccessCommand(iftrue)
-
     iffalse = node.iffalse
     ifCompound = iffalse.__class__.__name__
     if ifCompound == 'Compound':
+      # TODO contar else's
+      self.cmdCountStack.append(0)
+      self.incCmdCount('Else')
       cmdList = iffalse.block_items
       for cmd in cmdList:
         self.proccessCommand(cmd)
-    elif name != 'NoneType':
+    elif iffalse != None and name != 'NoneType':
+      self.cmdCountStack.append(0)
+      self.incCmdCount('Else')
       self.proccessCommand(iffalse)
     self.conditionCommandData.append(CommandInfo(self.cmdCountStack.pop(), condType, logicCount, relCount, opList))
-  
+
   def proccessCase (self, node):
     try:
       epxr = node.expr
@@ -335,7 +347,7 @@ class ASTAnalyser:
       self.commandCount[cmd] += 1
     else:
       self.commandCount[cmd] = 1
-  
+
   def checkCondType (self, expr):
     name = expr.__class__.__name__
     if name == 'BinaryOp':
@@ -385,4 +397,4 @@ class ASTAnalyser:
       self.proccessFor(node)
     else:
       self.cmdCountStack[-1] -= 1
-      #print("Unknown Command: %s" % name )
+      #print("Unknown Command: %s" % name )

+ 139 - 0
ccleaner.py

@@ -0,0 +1,139 @@
import re

# Very rough grammar for the C identifiers we want to anonymize.
VAR = r"(?:int|float|double|char|long|string)\s*\*?\s+([^;\(\)]+);$"
FUNC = re.compile(r"(?:int|float|double|char|long|string|void)\s*\*?\s+([a-zA-Z0-9_]+)\s*\([^)]*\)\s*(?:{|;)")
FUNC_PARAM = re.compile(r"\(\s*(?:int|float|double|char|long|string)\s*\*?\s+([^;\(\)]+)\)\s*(?:{|;)")
PARAM_DECL = re.compile(r"(?:int|float|double|char|long|string)\s*\*?\s+([a-zA-Z0-9_]+)")

class CodeCleaner:
    """Anonymizes C source for plagiarism/edit-distance comparison.

    Function names become f0, f1, ... and variable/parameter names become
    v0, v1, ... in declaration order; string literals and comments are
    copied through untouched.
    """

    def __init__ (self):
        self.varCount = 0
        self.funcCount = 0
        self.symbolMap = {}  # original identifier -> anonymized name

    def cleanCode (self, text):
        """Return *text* with every known identifier renamed.

        Pass 1 harvests function, parameter and variable names into
        symbolMap; pass 2 is a character scanner that rewrites identifiers
        while leaving strings and comments alone.
        """
        for m in FUNC.finditer(text):
            self.cleanFunc(m.group())
            for p in FUNC_PARAM.finditer(m.group()):
                self.cleanFuncParam(p.group())
        for m in re.finditer(VAR, text, re.MULTILINE):
            self.cleanVar(m.group())

        stringMode = False
        lineComment = False
        blockComment = False
        lastChar = ''
        output = []
        alpha = ""  # identifier characters accumulated so far

        def flush():
            # Emit any pending identifier, renamed when known.
            nonlocal alpha
            if alpha:
                output.append(self.symbolMap.get(alpha, alpha))
                alpha = ""

        for c in text:
            if stringMode:
                # NOTE(review): an escaped quote (\") terminates the string
                # early — acceptable for the distance-metric use case.
                stringMode = c != '"'
                output.append(c)
            elif lineComment:
                lineComment = c != '\n'
                output.append(c)
            elif blockComment:
                blockComment = (lastChar + c) != '*/'
                output.append(c)
            elif c == '/':
                # Fix: flush the pending identifier first — the original kept
                # accumulating, so "a/b" was emitted as "/ab".
                flush()
                if lastChar == '/':
                    lineComment = True
                output.append(c)
            elif c == '*':
                flush()  # fix: same out-of-order emission as '/' above
                if lastChar == '/':
                    blockComment = True
                output.append(c)
            elif c == '"':
                # Fix: the original silently discarded a pending identifier
                # that appeared right before a string literal.
                flush()
                stringMode = True
                output.append(c)
            elif re.match(r"[a-zA-Z0-9_]", c):
                alpha += c
            else:
                flush()
                output.append(c)
            lastChar = c
        # Fix: an identifier at end-of-text was dropped by the original.
        flush()

        return "".join(output)

    def cleanFunc (self, line):
        """Register the function name of a matched declaration as f<N>."""
        match = FUNC.search(line)
        if match is None:
            print(line)
            return  # fix: used to fall through and crash on match.group(1)
        varID = match.group(1)
        self.symbolMap[varID] = "f{}".format(self.funcCount)
        self.funcCount += 1

    def cleanFuncParam (self, line):
        """Register every parameter of a matched parameter list as v<N>."""
        match = FUNC_PARAM.findall(line)[0]
        ids = match.split(",")
        for i in range(len(ids)):
            varID = ids[i].strip()
            if varID.find(" ") == -1:
                # Bare name (type consumed by the outer regex).
                varID = re.search("[a-zA-Z_][a-zA-Z0-9_]*", varID).group()
                if varID not in self.symbolMap:
                    self.symbolMap[varID] = "v{}".format(self.varCount)
                    self.varCount += 1
            else:
                # "type name" pair — keep only the name token.
                varID = varID.split(" ")[1].strip()
                isMatch = re.search("[a-zA-Z_][a-zA-Z0-9_]*", varID)
                if isMatch is None:
                    print(match, ids[i])
                    continue
                if varID not in self.symbolMap:
                    self.symbolMap[varID] = "v{}".format(self.varCount)
                    self.varCount += 1

    def cleanVar (self, line):
        """Register every declared variable ("int a, b = 1;") as v<N>."""
        match = re.findall(VAR, line)[0]
        ids = match.split(",")
        for i in range(len(ids)):
            varID = ids[i].strip()
            if varID.find("=") != -1:
                varID = varID.split("=")[0].strip()  # drop the initializer
            isMatch = re.search("[a-zA-Z_][a-zA-Z0-9_]*", varID)
            if isMatch is None:
                continue
            varID = isMatch.group()
            if varID not in self.symbolMap:
                self.symbolMap[varID] = "v{}".format(self.varCount)
                self.varCount += 1

+ 97 - 8
cjson.py

@@ -1,11 +1,9 @@
-from __future__ import print_function
-
 import sys
 import os
 sys.path.extend(['.', '..'])
 from pycparser import parse_file, c_ast
 
-from Queue import Queue
+import queue
 from threading import Thread
 
 import re
@@ -13,6 +11,7 @@ import copy
 
 import files
 import analyser
+from submissionFileReader import getSubmissionFile
 
 COMMENT_REGEX = r"(//.*)|(/\*[\w\W\n\r]*?\*/)"
 USEFUL_REGEX = r"(//.*)|(/\*[\w\W\n\r]*?\*/)|(^\s*$)|(\{\s*\})|(^\s*\{\s*$)|(^\s*\}\s*$)"
@@ -40,7 +39,7 @@ class Worker (Thread):
         self.tasks = tasks
         self.daemon = True
         self.start()
-    
+
     def run (self):
         while True:
             func, args, kargs = self.tasks.get()
@@ -52,7 +51,7 @@ class Worker (Thread):
 class ThreadPool:
     """Pool of threads consuming tasks from a queue"""
     def __init__ (self, num_threads):
-        self.tasks = Queue(num_threads)
+        self.tasks = queue.Queue(num_threads)
         for _ in range(num_threads): Worker(self.tasks)
 
     def add_task (self, func, *args, **kargs):
@@ -131,13 +130,103 @@ def saveToFile (filePath, data):
     file.write(data)
     file.close()
 
def processDataFromCSV (parser, vplFolder):
    """End-to-end pipeline driven by the activity CSV.

    For every exercise known to *parser*, fetch each student's last valid
    submission file from *vplFolder*, analyse them on a thread pool, then
    aggregate into data.csv, for_structure.csv, cond_structure.csv and
    const_init.csv.

    NOTE(review): relies on module-level state — processStudentData is
    presumed to append its (astInfo, ...) tuples to finalDataList, and
    initEmptyDict / COMMANDS / DECLARATIONS / CSV_HEADER / FOR_CSV_HEADER /
    COND_CSV_HEADER / saveToFile are module globals; confirm in the rest of
    cjson.py.
    """
    assingments = parser.exercises  # (sic) typo for "assignments"
    data = {}
    # Phase 1: pick each student's latest submission file per exercise.
    for e in assingments:
        (allSubs, students) = parser.getSubmissions(e)
        studentData = []
        for student in students:
            submissions = parser.getStudentValidSubmissions(allSubs, student)
            try:
                assert len(submissions) > 0
                submissions.sort(key = lambda x : x.submission_id)
                lastSub = submissions[-1]
                content = getSubmissionFile(vplFolder, e, lastSub.submission_id)
                studentData.append((student, content, len(submissions)))
            except Exception:
                # Best-effort: students without a readable submission still
                # appear, with an empty file path.
                studentData.append((student, "", len(submissions)))
        data[e] = studentData[:]

    # Phase 2: analyse every (student, file) pair concurrently.
    pool = ThreadPool(10)
    for a in data:
        for studentData in data[a]:
            pool.add_task(processStudentData, studentData, a)
    pool.wait_completion()
    mainCSVFile = ""
    forCSVFile = ""
    condCSVFile = ""
    assignmentList = dict()
    constantInitCount = dict()
    # Group the worker results by assignment.
    # NOTE(review): this loop variable shadows the outer `data` dict (which
    # is no longer needed at this point).
    for data in finalDataList:
        if data[0].assignment in assignmentList:
            assignmentList[data[0].assignment].append(data)
        else:
            assignmentList[data[0].assignment] = list()
            assignmentList[data[0].assignment].append(data)
    # Phase 3: flatten per-student AST statistics into CSV rows.
    for assignmentKey in assignmentList:
        for studentData in assignmentList[assignmentKey]:
            astInfo = studentData[0]
            # Accumulate global constant-initializer histogram.
            for k in astInfo.constantInitCount:
                if k in constantInitCount:
                    constantInitCount[k] += astInfo.constantInitCount[k]
                else:
                    constantInitCount[k] = astInfo.constantInitCount[k]
            # Fixed-column dicts so every row has the same column order.
            studentOpData = initEmptyDict(analyser.VALID_OPS)
            for key in astInfo.operatorsCount:
                studentOpData[key] = astInfo.operatorsCount[key]
            studentCommandData = initEmptyDict(COMMANDS)
            for key in astInfo.commandCount:
                studentCommandData[key] = astInfo.commandCount[key]
            studentDeclarationData = initEmptyDict(DECLARATIONS)
            for key in astInfo.declarations:
                studentDeclarationData[key] = astInfo.declarations[key]
            for key in astInfo.declarationsPointers:
                studentDeclarationData["pointer_" + key] = astInfo.declarationsPointers[key]
            for key in astInfo.declarationsVectors:
                studentDeclarationData["vector_" + key] = astInfo.declarationsVectors[key]
            for key in astInfo.declarationsMatrixes:
                studentDeclarationData["matrix_" + key] = astInfo.declarationsMatrixes[key]
            mainCSVFile += "%s,%s,%s,%s,%s,%s" % (assignmentKey, astInfo.student, studentData[1], studentData[4], studentData[2], studentData[3])
            mainCSVFile += "," + ','.join([str(v) for v in studentOpData.values()])
            mainCSVFile += "," + ",".join([str(v) for v in studentCommandData.values()])
            mainCSVFile += "," + ",".join([str(v) for v in studentDeclarationData.values()])
            mainCSVFile += "\n"
            #For_structure.csv
            for i in astInfo.forCommandData:
                forCSVFile += "%s,%s,%s,%s,%s,%s" % (assignmentKey, astInfo.student, i.cmdCount, i.condType, i.numLogicOps, i.numRelOps)
                opData = initEmptyDict(analyser.VALID_OPS)
                for op in i.opList:
                    opData[op] += 1
                forCSVFile += "," + ','.join([str(v) for v in opData.values()])
                forCSVFile += ",%s,%s\n" % (i.useAssignment, i.useNext)
            #condition_structure.csv
            for i in astInfo.conditionCommandData:
                condCSVFile += "%s,%s,%s,%s,%s,%s" % (assignmentKey, astInfo.student, i.cmdCount, i.condType, i.numLogicOps, i.numRelOps)
                opData = initEmptyDict(analyser.VALID_OPS)
                for op in i.opList:
                    opData[op] += 1
                condCSVFile += "," + ','.join([str(v) for v in opData.values()])
                condCSVFile += "\n"
    # Phase 4: prepend headers and persist all four reports.
    mainCSVFile = ','.join(CSV_HEADER) + '\n' + mainCSVFile
    saveToFile("data.csv", mainCSVFile)
    forCSVFile = ','.join(FOR_CSV_HEADER) + '\n' + forCSVFile
    saveToFile("for_structure.csv", forCSVFile)
    condCSVFile = ','.join(COND_CSV_HEADER) + '\n' + condCSVFile
    saveToFile("cond_structure.csv", condCSVFile)
    constantInitFile = "constant,count\n"
    for k in constantInitCount:
        constantInitFile += "%s,%s\n" % (k, str(constantInitCount[k]))
    saveToFile("const_init.csv", constantInitFile)
    # NOTE(review): the labels look swapped — "Entrada" (input) is printed
    # with PRINT_COUNT (printf = output) and "Saida" with SCAN_COUNT
    # (scanf = input); confirm intent before relying on this log line.
    print("Entrada: {}, Saida: {}".format(analyser.PRINT_COUNT,analyser.SCAN_COUNT))
+
 #--- run ---#
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         file = sys.argv[1]
         if file == "-f" and len(sys.argv) > 2:
             print(processFile(sys.argv[2]))
-        elif file != "-f":    
+        elif file != "-f":
             raiz = "./" + sys.argv[1]
             data = loadAssignments(raiz)
             pool = ThreadPool(10)
@@ -209,8 +298,8 @@ if __name__ == "__main__":
             constantInitFile = "constant,count\n"
             for k in constantInitCount:
                 constantInitFile += "%s,%s\n" % (k, str(constantInitCount[k]))
-            saveToFile("const_init.csv", constantInitFile) 
+            saveToFile("const_init.csv", constantInitFile)
         else:
-            print("cjson -f file | cjon folder/")    
+            print("cjson -f file | cjon folder/")
     else:
         print("cjson -f file | cjon folder/")

+ 11 - 0
cminify.sh

@@ -0,0 +1,11 @@
#!/bin/bash
# Minify a C source file for edit-distance comparison:
#   1. drop runs of six spaces (indentation),
#   2. strip // line comments (must run while newlines still exist),
#   3. remove all newlines,
#   4. strip /* ... */ block comments on the resulting single line,
#   5. re-insert a newline after each #include directive.
# NOTE(review): step 4 uses a greedy .* — code between two block comments on
# the joined line is removed too; acceptable for distance metrics only.
if [[ -z "$1" ]]
then
  echo "No param provided!"
  exit 1
fi
# Fix: quote "$1" so file paths containing spaces do not word-split.
sed -rb 's/ {6}//g' "$1" |
sed -rb 's/\/\/.*$//g' |
tr -d '\n' |
sed -rb 's/\/\*.*\*\///g' |
sed -rb 's/(#include.*>)/\1\n/g'

+ 60 - 0
csvParser.py

@@ -0,0 +1,60 @@
+import pandas as pd
+from dateutil.tz import tzoffset
+from dateutil.parser import parse
+from dateutil.utils import default_tzinfo
+
class CSVParser:
    """Loads a VPL activity-log CSV and answers queries about submissions.

    The dataframe is expected to contain at least the columns:
    exercise_id, user_id, submission_id, action, time, grade.
    """

    def __init__ (self, path):
        self.df = pd.read_csv(path)
        # Timestamps are parsed assuming UTC-3 ("saw") and stored as epoch
        # seconds so they can be subtracted directly.
        tz = tzoffset(name="saw", offset=-10800)
        self.df['time'] = self.df['time'].transform(lambda x: default_tzinfo(parse(x), tz).timestamp())
        self.exercises = self.df['exercise_id'].unique().tolist()
        self.exercises.sort()

    def getSubmissions (self, exercise_id):
        """Return (dataframe, sorted student ids) for one exercise.

        When the sentinel grade 0.01 is present the whole column is assumed
        to be on a 0-1 scale and is rescaled to 0-10 (1.0 and NaN excepted).
        """
        # Fix: operate on an explicit copy — the original assigned into a
        # filtered slice of self.df, which raises SettingWithCopyWarning and
        # is unreliable under pandas copy-on-write.
        submissionData = self.df[self.df['exercise_id'] == exercise_id].copy()
        if 0.01 in submissionData['grade'].values:
            submissionData['grade'] = submissionData['grade'].transform(lambda x: x*10 if x != 1 and not pd.isna(x) else x)
        students = submissionData['user_id'].unique().tolist()
        students.sort()
        return (submissionData, students)

    def getStudentFirstInteraction (self, submissionData, studentID):
        """Timestamp of the 'view_description' event that precedes the
        student's first upload, or -1 when it cannot be determined."""
        # Filter and time-sort the student's events, as named tuples.
        subTuple = submissionData[submissionData['user_id'] == studentID].sort_values(by='time').itertuples(index=False, name="Submission")
        subList = list(subTuple)
        size = len(subList)
        for i in range(size):
            if subList[i].action == 'uploaded_submission':
                foundPos = self._backtrackViewDescription(subList, i)
                if foundPos >= 0:
                    return subList[foundPos].time
                else:
                    return -1
        # Fix: corrected the typos in this log message ("firt intereaction").
        print("first interaction not found: %i"%size)
        return -1

    def getStudentValidSubmissions (self, submissionData, studentID):
        """All deduplicated 'uploaded_submission' rows of one student,
        sorted by time (rows containing NaN are dropped)."""
        filteredDF = submissionData[(submissionData['user_id'] == studentID) & (submissionData['action'] == 'uploaded_submission')].dropna().sort_values(by='time')
        filteredDF = filteredDF.drop_duplicates(subset="submission_id")
        subList = list(filteredDF.itertuples(index=False, name="Submission"))
        return subList

    def getStudentLastSubmission (self, submissionData, studentID):
        """Same rows as getStudentValidSubmissions but sorted by
        submission_id ascending.

        NOTE(review): despite the name it returns the whole list, not just
        the last element — callers take [-1] themselves, presumably.
        """
        filteredDF = submissionData[(submissionData['user_id'] == studentID) & (submissionData['action'] == 'uploaded_submission')].dropna().sort_values(by='submission_id')
        filteredDF = filteredDF.drop_duplicates(subset="submission_id")
        subList = list(filteredDF.itertuples(index=False, name="Submission"))
        return subList

    def _backtrackViewDescription (self, subList, start_pos):
        """Walk backwards from start_pos-1 looking for a 'view_description'
        event; returns its index or -1."""
        i = start_pos - 1
        while (i >= 0):
            if subList[i].action == 'view_description':
                return i
            i = i - 1
        print("Backtrack view not found")
        return -1

+ 14 - 0
main.py

@@ -0,0 +1,14 @@
from csvParser import CSVParser
from cjson import processDataFromCSV
import sys


def bootstrap (csvPath, vplFolder):
    """Parse the activity CSV and run the full submission analysis over the
    VPL data folder."""
    parser = CSVParser(csvPath)
    processDataFromCSV(parser, vplFolder)


#--- run ---#
if __name__ == "__main__":
    # Fix: argument validation used `assert`, which is silently stripped
    # under `python -O`; fail explicitly with a usage message instead.
    if len(sys.argv) != 3:
        print("usage: main.py <submissions.csv> <vpl_data_folder>", file=sys.stderr)
        sys.exit(1)
    bootstrap(sys.argv[1], sys.argv[2])

+ 94 - 0
pycleaner.py

@@ -0,0 +1,94 @@
import re

# Very rough grammar for the Python identifiers we want to anonymize.
VAR = r"([a-zA-Z_][a-zA-Z0-9_]*)\s*=[^\n]+$"
FUNC = re.compile(r"def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\([^\)]*\)\s*:\s*")
FUNC_PARAM = re.compile(r"\(([^\)]*)\)\s*:\s*")

class CodeCleaner:
    """Anonymizes Python source for edit-distance comparison.

    Function names become f0, f1, ... and variable/parameter names become
    v0, v1, ...; comments and blank lines are removed, string literals are
    copied through untouched.
    """

    def __init__ (self):
        self.varCount = 0
        self.funcCount = 0
        self.symbolMap = {}  # original identifier -> anonymized name

    def cleanCode (self, text):
        """Return *text* minified with every known identifier renamed.

        Pass 1 harvests def-names, parameters and assignment targets into
        symbolMap; pass 2 is a character scanner that rewrites identifiers
        while skipping strings, and drops comments.
        """
        for m in FUNC.finditer(text):
            self.cleanFunc(m.group())
            for p in FUNC_PARAM.finditer(m.group()):
                self.cleanFuncParam(p.group())
        for m in re.finditer(VAR, text, re.MULTILINE):
            self.cleanVar(m.group())

        stringMode = False
        openChar = None  # quote character that opened the current string
        lineComment = False
        output = []
        alpha = ""  # identifier characters accumulated so far

        def flush():
            # Emit any pending identifier, renamed when known.
            nonlocal alpha
            if alpha:
                output.append(self.symbolMap.get(alpha, alpha))
                alpha = ""

        for c in text:
            if stringMode:
                # NOTE(review): an escaped quote (\' or \") terminates the
                # string early — acceptable for the distance-metric use case.
                stringMode = not (c == openChar)
                output.append(c)
            elif lineComment:
                # Comments are dropped entirely (their newline included).
                lineComment = c != '\n'
            elif c == '#':
                # Fix: flush first — a pending identifier used to leak past
                # the dropped comment into the following line.
                flush()
                lineComment = True
            elif c == '"' or c == '\'':
                # Fix: the original silently discarded a pending identifier
                # that appeared right before a string literal.
                flush()
                stringMode = True
                openChar = c
                output.append(c)
            elif re.match(r"[a-zA-Z0-9_]", c):
                alpha += c
            else:
                flush()
                output.append(c)
        # Fix: an identifier at end-of-text was dropped by the original.
        flush()

        joined = "".join(output)
        # Squash non-blank lines together with no separator: this mirrors
        # the C pipeline (which strips all newlines); the result is only
        # used for edit-distance comparison, never executed.
        return "".join([l for l in joined.splitlines() if len(l.strip()) > 0])

    def cleanFunc (self, line):
        """Register the function name of a matched 'def' as f<N>."""
        match = FUNC.search(line)
        if match is None:
            return  # defensive: caller always passes a FUNC match
        varID = match.group(1)
        self.symbolMap[varID] = "f{}".format(self.funcCount)
        self.funcCount += 1

    def cleanFuncParam (self, line):
        """Register every parameter of a matched '(...)' list as v<N>."""
        match = FUNC_PARAM.findall(line)[0]
        if len(match.strip()) <= 0:
            return
        for raw in match.split(","):
            varID = raw.strip()
            # NOTE(review): defaults/annotations ("a=1", "a: int") are kept
            # verbatim as map keys and simply never match during the scan.
            if varID not in self.symbolMap:
                self.symbolMap[varID] = "v{}".format(self.varCount)
                self.varCount += 1

    def cleanVar (self, line):
        """Register assignment targets ('name = ...') as v<N>."""
        for name in re.findall(VAR, line, re.M):
            varID = name.strip()
            if varID not in self.symbolMap:
                self.symbolMap[varID] = "v{}".format(self.varCount)
                self.varCount += 1

+ 107 - 0
submissionAnalysis.py

@@ -0,0 +1,107 @@
+from submissionFileReader import readSubmissionContent, levenshteinDistance
+from collections import namedtuple
+
+
class SubmissionAnalysis:
    """Computes per-submission effort metrics for one exercise.

    Per submission:
      TES - seconds elapsed since the previous submission (since the first
            interaction, for the first submission);
      DES - Levenshtein distance between consecutive submission contents
            (distance from "" for the first one);
      D/T - DES divided by TES (0 when either is 0).
    """

    def __init__ (self, exerciseID):
        self.exerciseID = exerciseID
        self.data = list()
        # Field names get an "f" prefix so they are valid namedtuple
        # identifiers even when exerciseID is numeric; headerMap translates
        # them back to the human-readable CSV header.
        self.SubmissionData = namedtuple("SubmissionData", "student_id f{0}_TES f{0}_DES f{0}_grade f{0}_DT f{0}_timestamp".format(exerciseID))
        headerMap = {}
        headerMap["f{0}_TES".format(exerciseID)] = "{0}_TES".format(exerciseID)
        headerMap["f{0}_DES".format(exerciseID)] = "{0}_DES".format(exerciseID)
        headerMap["f{0}_grade".format(exerciseID)] = "{0}_grade".format(exerciseID)
        headerMap["f{0}_DT".format(exerciseID)] = "{0}_D/T".format(exerciseID)
        headerMap["student_id"] = "student_id"
        headerMap["f{0}_timestamp".format(exerciseID)] = "{0}_timestamp".format(exerciseID)
        self.headerMap = headerMap

    def analyze (self, submissions, firstTimestamp, path):
        """Builds the metric tuples for one student's submission sequence.

        Returns (exerciseID, [SubmissionData, ...]).
        Raises AssertionError when no submission has readable source.
        """
        submissions = self.fixTimestamps(submissions)
        resultList = []
        submissionsWithFiles = []
        # Keep only submissions whose source file could be read and minified.
        for s in submissions:
            content = readSubmissionContent(path, self.exerciseID, s.submission_id)
            if len(content) > 0:
                submissionsWithFiles.append((s,content))
        assert len(submissionsWithFiles) >= 1, "No valid code submitted to exercise {} from student {}".format(self.exerciseID,submissions[0].user_id)
        total = len(submissionsWithFiles)
        # First submission: measured against the first interaction and "".
        firstTuple = submissionsWithFiles[0]
        first = firstTuple[0]
        firstTES = first.time - firstTimestamp
        firstContent = firstTuple[1]
        firstDES = levenshteinDistance("", firstContent)
        firstDT = 0 if firstTES == 0 or firstDES == 0 else firstDES/firstTES
        resultList.append(self.SubmissionData(first.user_id, firstTES, firstDES, first.grade, firstDT, first.time))
        # Remaining submissions: measured against their predecessor.
        for i in range(1, total):
            subTuple = submissionsWithFiles[i]
            sub = subTuple[0]
            tes = sub.time - submissionsWithFiles[i-1][0].time
            subContent = subTuple[1]
            prevContent = submissionsWithFiles[i-1][1]
            des = levenshteinDistance(prevContent,subContent)
            dt = 0 if tes == 0 or des == 0 else des/tes
            resultList.append(self.SubmissionData(sub.user_id, tes, des, sub.grade, dt, sub.time))
        return (self.exerciseID, resultList)

    def fixTimestamps (self, submissions):
        """De-duplicates identical consecutive timestamps by spreading them
        out, then re-sorts the list by time.

        NOTE(review): the `continue` below is a no-op (nothing follows it in
        the loop body); kept verbatim in this documentation-only pass.
        """
        sameTS = list()
        repeated = 0
        # Collect runs of indexes whose time equals their predecessor's.
        for i in range(1,len(submissions)):
            prev = i - 1
            if submissions[prev].time == submissions[i].time:
                sameTS.append(i)
            else:
                if len(sameTS) > 0:
                    repeated += len(sameTS)
                    self.spreadTSEvenly(sameTS, submissions)
                    sameTS = list()
                continue
        # Handle a run that extends to the end of the list.
        if len(sameTS) > 0:
            repeated += len(sameTS)
            self.spreadTSEvenly(sameTS, submissions)
        # we need to sort
        submissions.sort(key=lambda x: x.time)
        if repeated > 0:
            sub = submissions[0]
            print("{0} repeated {1} TS for exercise {2}".format(sub.user_id,repeated,self.exerciseID))
        return submissions

    def spreadTSEvenly (self, indexes, submissions):
        """Mutates *submissions* in place, nudging duplicated timestamps
        forward by fixed offsets (+30s for a single duplicate, +20s steps
        for a run).

        NOTE(review): the offsets are heuristic and may collide with later
        genuine timestamps — presumably acceptable; confirm.
        """
        print("repeated ts")
        if len(indexes) == 1:
            sub = submissions[indexes[0]]
            submissions[indexes[0]] = sub._replace(time=sub.time + 30)
        else:
            if len(indexes) > 2:
                print("We have a problem...")
            first = indexes[0]
            sub = submissions[first]
            submissions[first] = sub._replace(time=sub.time + 20)
            for i in range(1, len(indexes)):
                current = indexes[i]
                prev = indexes[i-1]
                sub = submissions[current]
                submissions[current] = sub._replace(time=submissions[prev].time + 20)


    def addData (self, submissionData):
        """Accumulates one student's result tuple for later export."""
        self.data.append(submissionData)

    def saveToCSV (self, folder, dataset):
        """Writes *dataset* (iterable of SubmissionData) to
        <folder>/<exerciseID>.csv with the translated header row."""
        with open("{}/{}.csv".format(folder, self.exerciseID),"w", encoding='utf-8') as file:
            fields = self.SubmissionData._fields
            translatedFields = [self.headerMap[x] for x in fields]
            header = ",".join(translatedFields)
            file.write(header)
            file.write('\n')
            for data in dataset:
                lineData = [getattr(data, x) for x in fields]
                line = ",".join(str(e) for e in lineData)
                file.write(line)
                file.write('\n')
            file.close()  # redundant: the 'with' block closes the file

+ 98 - 0
submissionFileReader.py

@@ -0,0 +1,98 @@
+import sys
+from math import trunc
+import os
+import tempfile
+import glob
+from ccleaner import CodeCleaner
+from pycleaner import CodeCleaner as PyCleaner
+sys.path.extend(['.', '..'])
+import re
+import subprocess
+import numpy as np
+
+USEFUL_REGEX = r"(//.*)|(/\*[\w\W\n\r]*?\*/)|(^\s*$)|(\{\s*\})|(^\s*\{\s*$)|(^\s*\}\s*$)"
+FILE_REGEX = re.compile(r".*\.(py|c)$",re.IGNORECASE)
+#VAR_NO_INIT = re.compile(r"(int|float|double|char|long|string)\s*\*?([a-zA-Z0-9_]+)\s*(,\s*[a-zA-Z0-9)]+\s*)*;")
+#VAR_INIT = re.compile(r"(int|float|double|char|long|string)\s*\*?([a-zA-Z0-9_]+)[^=]*=\s*[^,](,\s*[a-zA-Z0-9)][^=]*=\s*[^,])*;")
+
def readSubmissionContent (path, exerciseID, submissionID):
    """Locate, read and minify the first .c/.py file of a submission.

    Returns the minified source, or "" when no suitable file exists or
    minification fails (best-effort: the pipeline treats it as empty).
    """
    filePath = "%s%s/%s/submittedfiles/*.*"%(path, trunc(exerciseID), trunc(submissionID))
    files = glob.glob(filePath)
    files = [f for f in files if FILE_REGEX.match(f) != None]
    if len(files) == 0:
        print("Evaluated code for exercise {}, submission {} doesn't have a single submitted .c/.py file at {}".format(exerciseID, submissionID, filePath))
        return ""
    # Only the first matching file is analysed.
    file = files[0]
    # Fix: use a context manager — the original `open(...).read()` leaked
    # the file handle.
    with open(file, "r") as handle:
        content = handle.read()
    try:
        if file.endswith("py") :
            #python
            return minifyPython(content)
        else:
            #c code
            return minifyC(content)
    except Exception as e:
        print(e)
        return ""
+
def minifyC (text):
    """Minify C source: anonymize identifiers with CodeCleaner, then pipe
    the result through the external ./cminify.sh script (strips comments
    and newlines).

    Returns the script's stdout ("" when it produced no output).
    """
    usefulText = CodeCleaner().cleanCode(text)
    #print(usefulText)
    out = ""
    # NOTE(review): reopening a NamedTemporaryFile by name while it is still
    # open works on POSIX but not on Windows — presumably Linux-only tooling.
    with tempfile.NamedTemporaryFile() as fp:
        fp.write(usefulText.encode("utf-8"))
        fp.flush()
        # cminify.sh must be executable and in the current working directory.
        process = subprocess.run("./cminify.sh {}".format(fp.name),shell=True, capture_output=True, text=True)
        out = process.stdout
        fp.close()  # redundant: the 'with' block closes fp anyway
    return out
+
def minifyPython (text):
    """Return *text* minified by the Python CodeCleaner (identifiers
    anonymized, comments and blank lines removed)."""
    return PyCleaner().cleanCode(text)
+
def getSubmissionFile (path, exerciseID, submissionID):
    """Return the path of the first .c/.py file in the submission's
    'submittedfiles' folder.

    Raises AssertionError when none exists.
    NOTE(review): `assert` is stripped under `python -O`; callers catch the
    exception broadly, so confirm -O is never used in deployment.
    """
    filePath = "%s%s/%s/submittedfiles/*.*"%(path, trunc(exerciseID), trunc(submissionID))
    files = glob.glob(filePath)
    files = [f for f in files if FILE_REGEX.match(f) != None]
    assert len(files) > 0, "Evaluated code for exercise {}, submission {} doesn't have a single submitted .c file at {}".format(exerciseID, submissionID, filePath)
    return files[0]
+
def getUsefulLines (path, exerciseID, submissionID):
    """Count the useful lines of a submission's source file.

    "Useful" excludes comments, blank lines and lines that are only braces
    (see USEFUL_REGEX). The count is the number of line separators joining
    the remaining lines, i.e. kept lines minus one.
    """
    subFile = getSubmissionFile(path, exerciseID, submissionID)
    with open(subFile, "r") as fileHandler:
        raw = fileHandler.read()
    stripped = re.sub(USEFUL_REGEX, "", raw, flags=re.MULTILINE)
    kept = [line for line in stripped.splitlines() if line]
    return os.linesep.join(kept).count("\n")
+
def levenshteinDistance (seq1, seq2):
    """Classic dynamic-programming Levenshtein (edit) distance.

    Returns the minimum number of insertions, deletions and substitutions
    needed to turn seq1 into seq2, as a numpy float.
    """
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    dist = np.zeros((rows, cols))
    # Transforming from/to the empty prefix costs its length.
    dist[:, 0] = np.arange(rows)
    dist[0, :] = np.arange(cols)

    for r in range(1, rows):
        for c in range(1, cols):
            substitution = 0 if seq1[r - 1] == seq2[c - 1] else 1
            dist[r, c] = min(
                dist[r - 1, c] + 1,              # deletion
                dist[r, c - 1] + 1,              # insertion
                dist[r - 1, c - 1] + substitution,
            )
    return dist[rows - 1, cols - 1]
+

+ 43 - 0
threadPool.py

@@ -0,0 +1,43 @@
+from queue import Queue
+from threading import Thread
+from joblib import Parallel, delayed
+import multiprocessing as mb
+import traceback
+
def createParallelPool (func, inputs):
    """Run func over every (task, workload) pair in parallel; return the results.

    Fixes two defects in the original:
    - `mb.cpu_count()/2` is a float under Python 3; joblib's n_jobs must be
      an int, so integer division (floored at 1) is used instead.
    - `Parallel` should be invoked once with a generator of delayed calls;
      re-entering the instance per item and returning the instance itself
      discarded the actual results.
    """
    n_jobs = max(1, mb.cpu_count() // 2)
    return Parallel(n_jobs=n_jobs)(
        delayed(func)(task, workload) for (task, workload) in inputs
    )
+
class Worker (Thread):
    """Daemon thread that consumes and executes tasks from a shared queue."""

    def __init__ (self, tasks):
        # daemon=True: the worker must not block interpreter shutdown
        super().__init__(daemon=True)
        self.tasks = tasks
        self.start()

    def run (self):
        while True:
            func, args, kwargs = self.tasks.get()
            try:
                func(*args, **kwargs)
            except Exception as err:
                # report and keep consuming; a failing task must not kill the worker
                print(type(err), err)
            self.tasks.task_done()
+
class ThreadPool:
    """Fixed-size pool of Worker threads fed through a bounded queue."""

    def __init__ (self, num_threads):
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task (self, func, *args, **kargs):
        """Enqueue func(*args, **kargs) for execution by a worker."""
        self.tasks.put((func, args, kargs))

    def wait_completion (self):
        """Block until every queued task has been processed."""
        self.tasks.join()

+ 94 - 0
vplAnalyzer.py

@@ -0,0 +1,94 @@
+from csvParser import CSVParser
+import sys
+import traceback
+from submissionAnalysis import SubmissionAnalysis
+from submissionFileReader import getUsefulLines
+from joblib import Parallel, delayed
+
def processSubmission (subAnalisys:SubmissionAnalysis, workload:'tuple[int,list,int,str]'):
    """Analyze one student's submissions; return the result or None on failure."""
    (studentID, submissions, firstTimestamp, path) = workload
    # guard: nothing to analyze without a first interaction and submissions
    if firstTimestamp < 0 or not submissions:
        print("No valid submission for exercise {} from student {}".format(subAnalisys.exerciseID, studentID))
        return None
    try:
        return subAnalisys.analyze(submissions, firstTimestamp, path)
    except Exception as e:
        # boundary handler: log and keep the batch going
        print(e)
        traceback.print_exc()
        return None
+
def countUsefulLines (subAnalysis:SubmissionAnalysis, workload):
    """Count useful lines of a student's last submission; None when unavailable."""
    (studentID, submissions, path) = workload
    if not submissions:
        print("No valid submission for exercise {} from student {}".format(subAnalysis.exerciseID, studentID))
        return None
    try:
        lineCount = getUsefulLines(path,subAnalysis.exerciseID,submissions[-1].submission_id)
        return (subAnalysis.exerciseID, studentID, lineCount)
    except Exception as e:
        # boundary handler: log and keep the batch going
        print(e)
        traceback.print_exc()
        return None
+
+
def add_task (pool, task, workload):
    """Append a (task, workload) pair to the pending-work list."""
    pool.append((task, workload))
+
def bootstrap (csvPath, vplFolder, outputFolder):
    """Analyze every student submission per exercise and write one CSV each.

    Builds a (task, workload) pool covering all exercises and students,
    runs the analyses on 8 joblib workers, groups the results per exercise
    and delegates CSV output to each exercise's SubmissionAnalysis.
    """
    parser = CSVParser(csvPath)
    pool = []
    tasks = {}
    for e in parser.exercises:
        (submissions, students) = parser.getSubmissions(e)
        studentsSub = [ (s, parser.getStudentValidSubmissions(submissions, s), parser.getStudentFirstInteraction(submissions, s), vplFolder) for s in students]
        task = SubmissionAnalysis(e)
        tasks[e] = task
        for workload in studentsSub:
            add_task(pool, task, workload)
    result = Parallel(8)(delayed(processSubmission)(t,w) for (t,w) in pool)
    exerciseMap = {}
    for r in result:
        if r is None:  # failed/empty analyses are skipped
            continue
        (e, data) = r
        # group result rows per exercise
        exerciseMap.setdefault(e, []).extend(data)
    for e in exerciseMap:
        tasks[e].saveToCSV(outputFolder, exerciseMap[e])
+
def checkUsefulLines (csvPath, vplFolder, _):
    """Count useful lines of each student's last submission; dump a CSV.

    Results are written to useful_count.csv as exercise,student,count rows.
    """
    parser = CSVParser(csvPath)
    pool = []
    for e in parser.exercises:
        # NOTE(review): hard-coded exercise filter kept from the original run
        if e not in [5035,4988]:
            continue
        (submissions, students) = parser.getSubmissions(e)
        studentsSub = [ (s, parser.getStudentLastSubmission(submissions, s), vplFolder) for s in students]
        task = SubmissionAnalysis(e)
        for workload in studentsSub:
            add_task(pool, task, workload)
    result = Parallel(8)(delayed(countUsefulLines)(t,w) for (t,w) in pool)
    output = ''
    for r in result:
        if r is None:  # skip failed counts
            continue
        (exercise, student, count) = r
        output += f'{exercise},{student},{count}\n'
    with open("useful_count.csv",'w') as fileHandler:
        # the context manager closes the file; explicit close() removed
        fileHandler.write(output)
+        fileHandler.close()
+
+
CHECK_LINES = True  # toggle: count useful lines instead of running the full analysis

#--- run ---#
if __name__ == "__main__":
    # `assert` is stripped under `python -O`; validate argv explicitly instead
    if len(sys.argv) != 4:
        raise SystemExit("You must provide the following: path to the csv, the folder with vpl data and the output folder path respectively")
    if not CHECK_LINES:
        bootstrap(sys.argv[1], sys.argv[2], sys.argv[3])
    else:
        checkUsefulLines(sys.argv[1], sys.argv[2], sys.argv[3])
+