Browse Source

Fix duplicate bugs

Lucas de Souza 5 years ago
parent
commit
f071217168
6 changed files with 70 additions and 11 deletions
  1. 8 1
      .gitignore
  2. 0 0
      bak/.gitkeep
  3. 3 3
      csvconfig.txt
  4. 0 0
      csvs/.gitkeep
  5. 11 0
      helpers.py
  6. 48 7
      main.py

+ 8 - 1
.gitignore

@@ -1,3 +1,10 @@
 __pycache__/
 .vscode
-**/*/__pycache__/
+**/*/__pycache__/
+bak/*
+!bak/.gitkeep
+csvs/*
+!csvs/.gitkeep
+.csvstate
+accepted.csv
+rejected.csv

+ 0 - 0
bak/.gitkeep


+ 3 - 3
csvconfig.txt

@@ -1,4 +1,4 @@
 csvs/export2019.04.04-15.12.40.csv:ieee:,
-csvs/scopus.csv:scopus:,
-csvs/wos1-500.txt:scopus:\t
-csvs/wos501-532.txt:scopus:\t
+csvs/wos1-500.txt:webscience:\t
+csvs/wos501-532.txt:webscience:\t
+csvs/scopus.csv:scopus:,

+ 0 - 0
csvs/.gitkeep


+ 11 - 0
helpers.py

@@ -1,6 +1,8 @@
 import csv
 import sys
 import os
+import shutil
+import datetime
 import tkinter as tk
 from tkinter import messagebox
 import hashlib
@@ -17,6 +19,7 @@ def loadFiles (files):
   return loadedFiles
 
 def saveAcceptedCSV (data):
+  doBackup('accepted.csv')
   file = open('./accepted.csv', mode='w')
   headers = data['header']
   writer = csv.writer(file, delimiter=',')
@@ -24,10 +27,12 @@ def saveAcceptedCSV (data):
   data.pop('header')
   for row in data.values():
     writer.writerow(row)
+  data['header'] = headers
   file.flush()
   file.close()
 
 def saveRejectedCSV (data):
+  doBackup('rejected.csv')
   file = open('./rejected.csv', mode='w')
   headers = data['header']
   writer = csv.writer(file, delimiter=',')
@@ -35,6 +40,7 @@ def saveRejectedCSV (data):
   data.pop('header')
   for row in data.values():
     writer.writerow(row)
+  data['header'] = headers
   file.flush()
   file.close()
 
@@ -51,10 +57,15 @@ def removeStateFile ():
     pass
 
 def saveState (data):
+  doBackup(".csvstate")
   file = open('./.csvstate', mode='wb')
   pickle.dump(data, file)
   file.close()
 
+def doBackup (fileName):
+  filePath = './bak/{0}'.format(fileName)
+  shutil.copy('./{0}'.format(fileName), "{0}.{1}.bak".format(filePath, datetime.datetime.now()))
+
 def loadState (root):
   try:
     file = open('./.csvstate', mode='rb')

+ 48 - 7
main.py

@@ -12,6 +12,7 @@ csvFiles = None
 state = None
 acceptedDataCSV = None
 rejectedDataCSV = None
+nextCount = 0
 
 def findDuplicate (hashDict, title):
   if title in hashDict:
@@ -27,7 +28,7 @@ def acceptPaper (file, csvRow, index):
   digest = hashlib.sha256(str.encode(fileFilter.getTitle(csvRow))).hexdigest()
   duplicate = findDuplicate(rejectedDataCSV, digest) or findDuplicate(acceptedDataCSV, digest)
   if duplicate:
-    state['duplihttps://mail.google.com/mail/u/0/#inboxcate'] += 1
+    state['duplicate'] += 1
   else:
     data.append("{0} - {1}".format(file['path'],index))
     acceptedDataCSV[digest] = data
@@ -69,7 +70,6 @@ def acceptButtonHandler (*arg):
   nextPaper()
   return
 
-
 def updateAndClear (textField, *args):
   textField.config(state=NORMAL)
   textField.delete('1.0', END)
@@ -95,7 +95,6 @@ def loadFiles ():
   if len(rejectedDataCSV) == 0:
     rejectedDataCSV['header'] = helpers.FINAL_CSV_HEADERS
   csvFiles = [(list(f[0]), f[1]) for f in helpers.loadFiles(filesInfo)]
-  setPaperFromState()
 
 def setPaperFromState ():
   global titleLabel
@@ -121,6 +120,7 @@ def nextPaper ():
   global titleLabel
   global text
   global root
+  global nextCount
   fileIndex = state['file_index']
   if fileIndex >= len(filesInfo):
     helpers.removeStateFile()
@@ -136,6 +136,9 @@ def nextPaper ():
     state['row_index'] = -1
     nextPaper()
   else:
+    nextCount += 1
+    if nextCount%10 == 0:
+      saveCurrentData()
     file = fileList[index]
     fileFilter = helpers.getFilter(csvFiles[fileIndex][1])
     title = fileFilter.getTitle(file)
@@ -144,15 +147,47 @@ def nextPaper ():
     text.set(abstract)
 
 def onClosing ():
-  global state
   global root
+  saveCurrentData()
+  root.destroy()
+
+def saveCurrentData ():
+  global state
   global acceptedDataCSV
   global rejectedDataCSV
   helpers.saveState(state)
   helpers.saveAcceptedCSV(acceptedDataCSV)
   helpers.saveRejectedCSV(rejectedDataCSV)
-  root.destroy()
 
+def updateDup ():
+  global acceptedDataCSV
+  global rejectedDataCSV
+  global state
+  tempA = {}
+  tempR = {}
+  dataR = list(rejectedDataCSV.values())
+  for i in range(1,len(dataR)):
+    digest = hashlib.sha256(str.encode(dataR[i][0].lower())).hexdigest()
+    if digest in tempR:
+      print("Found duplicate in rejected: {0}".format(dataR[i][0]))
+      state['duplicate'] += 1
+    else:
+      tempR[digest] = dataR[i]
+  dataA = list(acceptedDataCSV.values())
+  for i in range(1, len(dataA)):
+    digest = hashlib.sha256(str.encode(dataA[i][0].lower())).hexdigest()
+    if digest in tempA or digest in tempR:
+      print("Found duplicate in accepted: {0}".format(dataA[i][0]))
+      state['duplicate'] += 1
+    else:
+      tempA[digest] = dataA[i]
+  state['accepted'] = len(tempA)
+  state['rejected'] = len(tempR)
+  tempA['header'] = dataA[0]
+  tempR['header'] = dataR[0]
+  acceptedDataCSV = tempA
+  rejectedDataCSV = tempR
+  saveCurrentData()
 
 def main ():
   global root
@@ -177,7 +212,13 @@ def main ():
   text.trace('w', lambda *arg: updateAndClear(textField))
   textField.config(state=DISABLED)
   root.protocol("WM_DELETE_WINDOW", onClosing)
+  setPaperFromState()
+  #updateDup()
+  print("Duplicate count {0}".format(state['duplicate']))
+  print("Accepted count {0}".format(state['accepted']))
+  print("Rejected count {0}".format(state['rejected']))
   root.mainloop()
 
-loadFiles()
-main()
+if __name__ == '__main__':
+  loadFiles()
+  main()