From aca588fa0900ccdf7c4612e035b469efbb2b5dca Mon Sep 17 00:00:00 2001 From: bmaz Date: Wed, 7 Jul 2021 15:41:13 +0200 Subject: [PATCH] deal with nb of columns / missing columns in --resume. close #98 --- gazouilloire/exports/export_csv.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/gazouilloire/exports/export_csv.py b/gazouilloire/exports/export_csv.py index 3187d60..07d4687 100644 --- a/gazouilloire/exports/export_csv.py +++ b/gazouilloire/exports/export_csv.py @@ -12,6 +12,7 @@ from twitwi.constants import TWEET_FIELDS from gazouilloire.config_format import log from casanova import reverse_reader +from casanova.exceptions import MissingColumnError def date_to_timestamp(date): return str(date.timestamp()) @@ -126,14 +127,14 @@ def call_database(conf): def find_potential_duplicate_ids(outputfile): - """ - if there is no timestamp in the initial file, error from the beginning? - what if the 2 files do not have the same number of columns? - @param outputfile: - @return: last_timestamp, last_ids - """ last_ids = set() - last_time = reverse_reader.last_cell(outputfile, 'local_time') + try: + last_time = reverse_reader.last_cell(outputfile, 'local_time') + except MissingColumnError: + log.error("A 'local_time' column is missing in file {} in order to use the --resume/-r option".format( + outputfile + )) + sys.exit(1) with open(outputfile, "r") as f: rev_reader = reverse_reader(f) for row in rev_reader: @@ -154,6 +155,16 @@ def export_csv(conf, query, exclude_threads, exclude_retweets, since, until, .format(field)) else: SELECTION = TWEET_FIELDS + if resume: + with open(outputfile, "r") as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + if sorted(fieldnames) == sorted(SELECTION): + SELECTION = fieldnames + else: + log.error("The column names in the {} file do not match the export format".format(outputfile)) + sys.exit(1) + db = call_database(conf)