Coverage for src/csvdiff2/csvdiff.py : 96%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python2
2# -*- coding: utf-8 -*-
3import abc
4import csv
5import functools
6import logging
7import os
8import sys
9import timeit
10from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
13# ----------------------------------------------------------------------------------------------------------------------
14# Decorators
15# ----------------------------------------------------------------------------------------------------------------------
def show_execution_time():
    """Build a decorator that reports the wall-clock time of each call.

    Returns:
        A decorator. Each call of the decorated function is timed with
        ``timeit.default_timer`` and, once it returns, a blank line,
        ``elapsed_time=<seconds>[sec]`` and another blank line are printed.
        The decorated function's return value is passed through to the caller.
        Nothing is printed when the decorated function raises.
    """

    def _execution_time(func):

        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            start = timeit.default_timer()
            result = func(*args, **kwargs)
            elapsed_time = timeit.default_timer() - start

            # print('') emits an empty line on both Python 2 and Python 3,
            # whereas a bare `print` is a no-op expression on Python 3.
            print('')
            print("elapsed_time={0}".format(elapsed_time) + "[sec]")
            print('')
            return result

        return wrapper

    return _execution_time
def spacing_before(number_of_lines):
    """Build a decorator that prints empty lines before each call.

    Args:
        number_of_lines: How many empty lines to print. A falsy value
            (``None`` or ``0``) is treated as 1.

    Returns:
        A decorator. The decorated function is called after the empty lines
        have been printed and its return value is passed through.
    """
    number_of_lines = number_of_lines or 1

    def _spacing_before(func):

        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            for _ in range(number_of_lines):
                print('')
            return func(*args, **kwargs)

        return wrapper

    return _spacing_before
55# ----------------------------------------------------------------------------------------------------------------------
56# Entrance
57# ----------------------------------------------------------------------------------------------------------------------
59# @show_execution_time()
def main():
    """Command-line entry point.

    Configures logging, builds the context from the command-line arguments,
    logs that context at debug level and runs the comparison. An IndexError
    raised while running is logged as an error and the process exits with
    status 1.
    """
    configure()

    cxt = context_from_arguments()
    show_context_for_debugging(cxt)

    try:
        run_in(cxt)
    except IndexError as error:
        message_template = ('It is possible that the number of columns in the row is not aligned. '
                            'Please check the csv data. If not, please file an issue. '
                            '[{}, description={}]')
        logger.error(message_template.format(type(error), error))
        sys.exit(1)
class App(type):
    """Application-wide constants; read as class attributes, e.g. ``App.VERSION``."""

    # Version string; interpolated into the text of the --version option.
    VERSION = '1.0.0'
class LoggingConfig(type):
    """Logging settings read by ``configure()``.

    If you want to debug, play with the CONSOLE_LEVEL or FILE_LEVEL.
    """

    # Level passed to logging.basicConfig().
    BASE_LEVEL = logging.DEBUG

    # Console (stream) handler settings.
    CONSOLE_LEVEL = logging.ERROR
    CONSOLE_FORMAT = '%(levelname)s: %(message)s'

    # File handler settings.
    FILE_LEVEL = logging.WARNING
    FILE_FORMAT = '%(asctime)s: %(levelname)s: %(message)s'
    FILE_PATH = 'csvdiff.log'
# Module-level logger; its handlers are attached by configure().
logger = logging.getLogger(__name__)
def configure():
    """Attach the console and file handlers described by ``LoggingConfig``.

    The module logger gets a stream handler and a file handler (the log file
    is opened in 'w' mode, i.e. truncated), each with its own level and
    format, and stops propagating to the root logger.

    Calling this again once the module logger already has handlers is a
    no-op: adding the handlers a second time would duplicate every log line
    and leave the previously opened log file handle dangling.
    """
    logging.basicConfig(level=LoggingConfig.BASE_LEVEL)

    if logger.handlers:
        # Already configured; do not stack a second pair of handlers.
        return

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(LoggingConfig.CONSOLE_LEVEL)
    stream_handler.setFormatter(logging.Formatter(LoggingConfig.CONSOLE_FORMAT))

    file_handler = logging.FileHandler(filename=LoggingConfig.FILE_PATH, mode='w')
    file_handler.setLevel(LoggingConfig.FILE_LEVEL)
    file_handler.setFormatter(logging.Formatter(LoggingConfig.FILE_FORMAT))

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    # Records are handled only by the two handlers above, not by the root logger.
    logger.propagate = False
114# ----------------------------------------------------------------------------------------------------------------------
115# Context Preparation
116# ----------------------------------------------------------------------------------------------------------------------
def context_from_arguments():
    """Parse the command line and wrap the result in a ``Context``.

    Returns:
        Context: built from ``parser.parse_args()``.

    Notes:
        ``--sniffing-size`` is parsed as an integer, so a size given on the
        command line has the same type as the numeric default (4096).
        The CSV-list argument types return real lists, so the parsed values
        can be iterated more than once.
    """

    def arg_type_matching_key_in_csv(x):
        # '0:8,3' -> [MatchingKeyInfo('0:8'), MatchingKeyInfo('3')]
        return [MatchingKeyInfo(spec) for spec in x.split(',')]

    def arg_type_int_in_csv(x):
        # '3,7' -> [3, 7]; a non-numeric item raises ValueError, which argparse reports.
        return [int(index) for index in x.split(',')]

    parser = ArgumentParser(prog='csv-diff-python2@blue-monk', formatter_class=ArgumentDefaultsHelpFormatter)

    # Program name & Version -------------------------------------------------------------------------------------------
    parser.add_argument('--version', action='version', version='%(prog)s {}'.format(App.VERSION))

    # Input CSV file paths ---------------------------------------------------------------------------------------------
    parser.add_argument('lhs_file_name', type=str, help='Absolute/Relative path to left-hand side file.')
    parser.add_argument('rhs_file_name', type=str, help='Absolute/Relative path to right-hand side file.')

    # Matching conditions ----------------------------------------------------------------------------------------------
    # The string default '0' is run through the `type` callable by argparse.
    parser.add_argument('-k', '--matching-keys', type=arg_type_matching_key_in_csv, default='0',
                        help='Matching key indices(from 0) for Input CSV in CSV format. For non-fixed length numbers, specify the number of digits after ":". e.g.: 0:8,3')
    parser.add_argument('-u', '--unique-key', default=False, action='store_true',
                        help="Specify if the matching key is unique. Then, if it detects that the matching key is not unique, an error will occur.")
    parser.add_argument('-i', '--ignore-columns', type=arg_type_int_in_csv, default=[],
                        help='Specify the index of the column to be ignored in CSV format. e.g.: 3,7')

    # Report styles ----------------------------------------------------------------------------------------------------
    parser.add_argument('-v', '--vertical-style', default=False, action='store_true',
                        help='Report in vertical style. If not specified, report in horizontal(two facing) style.')

    parser.add_argument('-c', '--show-count', default=False, action='store_true',
                        help='Report the number of differences. Treat this as True if neither -d nor -a is specified.')

    display_group = parser.add_mutually_exclusive_group()
    display_group.add_argument('-d', '--show-difference-only', default=False, action='store_true',
                               help='Report the lines with the difference. Can be used with option -c. Cannot be used with option -a.')
    display_group.add_argument('-a', '--show-all-lines', action='store_true',
                               help='Report on all lines. Can be used with option -c. Cannot be used with option -d.')

    parser.add_argument('-x', '--show-context-from-arguments', default=False, action='store_true',
                        help='Report the context generated from the arguments and CSV sniffing.')

    # CSV analysis conditions ------------------------------------------------------------------------------------------
    parser.add_argument('-H', '--header', type=str, default=None, choices=['n', 'y'],
                        help='If specified, this specification will be enforced.')

    parser.add_argument('-S', '--sniffing-size', type=int, default=4096,
                        help="If csv sniffing fails, try specifying a size larger than 4096. Or Explicitly specify CSV file conditions like '--column-separator-for-lhs TAB'. Check help with -h option.")

    parser.add_argument('-F', '--force-individual-specs', action='store_true',
                        help="If you don't want to rely on csv sniffing, specify it, and then specify --column-separator and so on separately.")

    parser.add_argument('--column-separator', type=str, default=None, choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process both sides CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator', type=str, default=None, choices=['LF', 'CRLF'],
                        help='Process both sides CSV file using the specified line separator.')

    parser.add_argument('--quote-char', type=str, default=None, choices=['"', "'"],
                        help='Process both sides CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator', action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the both sides CSV file.')

    # CSV analysis conditions by left and right ------------------------------------------------------------------------
    parser.add_argument('--column-separator-for-lhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process left-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--column-separator-for-rhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process right-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator-for-lhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process left-hand side CSV file using the specified line separator.')

    parser.add_argument('--line-separator-for-rhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process right-hand side CSV file using the specified line separator.')

    parser.add_argument('--quote-char-for-lhs', type=str, default='"', choices=['"', "'"],
                        help='Process left-hand side CSV file using the specified quote character.')

    parser.add_argument('--quote-char-for-rhs', type=str, default='"', choices=['"', "'"],
                        help='Process right-hand side CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator-for-lhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the left side.')

    parser.add_argument('--no-skip-space-after-column-separator-for-rhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the right side.')

    # ------------------------------------------------------------------------------------------------------------------

    return Context(parser.parse_args())
class Context:
    """Run-time settings derived from the parsed command-line arguments.

    The constructor copies the argument values into attributes, derives a few
    convenience flags, checks that both input paths exist and are files
    (logging an error and exiting with status 1 otherwise), and turns on
    ``shows_count`` when no display option was requested.
    """

    # Symbolic name -> actual line separator.
    LINE_SEPARATOR_s = {
        "CR": '\r',
        "LF": '\n',
        "CRLF": '\r\n',
        None: '<None>',
    }

    # Symbolic name -> actual column separator.
    COLUMN_SEPARATOR_s = {
        "COMMA": ',',
        "TAB": '\t',
        "SEMICOLON": ';',
        None: '<None>',
    }

    def __init__(self, args):
        """Populate the context from ``args`` (the namespace returned by argparse)."""

        # Input CSV file paths ---------------------------------------------------------------------------------------------
        self.lhs_file_name = args.lhs_file_name
        self.rhs_file_name = args.rhs_file_name
        self.lhs_file_path = os.path.abspath(args.lhs_file_name)
        self.rhs_file_path = os.path.abspath(args.rhs_file_name)

        # Matching conditions ----------------------------------------------------------------------------------------------
        self.matching_key_codec = MatchingKeyCodec(args.matching_keys)
        self.key_should_be_unique = args.unique_key
        self.column_indices_to_ignore = args.ignore_columns

        # Report styles ----------------------------------------------------------------------------------------------------
        self.reports_in_vertical_style = args.vertical_style
        self.reports_in_horizontal_style = not args.vertical_style

        self.shows_count = args.show_count
        self.shows_difference_only = args.show_difference_only
        self.shows_all_lines = args.show_all_lines
        self.shows_details = bool(self.shows_difference_only or self.shows_all_lines)
        self.shows_context_from_arguments = args.show_context_from_arguments

        # Only the horizontal detail report needs the padding sizes.
        self.needs_size_info_for_padding = self.shows_details and self.reports_in_horizontal_style

        # CSV analysis conditions ------------------------------------------------------------------------------------------
        self.header = args.header
        self.first_row_is_header = None

        self.sniffing_size = args.sniffing_size

        self.forces_individual_specs = args.force_individual_specs

        # For each condition: a both-sides option given together with -F wins,
        # otherwise the per-side options are used.
        if self.forces_individual_specs and args.column_separator:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator]
        else:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_lhs]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_rhs]

        if self.forces_individual_specs and args.line_separator:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator]
        else:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator_for_lhs]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator_for_rhs]

        if self.forces_individual_specs and args.quote_char:
            self.quote_char_for_lhs = args.quote_char
            self.quote_char_for_rhs = args.quote_char
        else:
            self.quote_char_for_lhs = args.quote_char_for_lhs
            self.quote_char_for_rhs = args.quote_char_for_rhs

        if self.forces_individual_specs and args.no_skip_space_after_column_separator:
            self.skips_space_after_column_separator_for_lhs = False
            self.skips_space_after_column_separator_for_rhs = False
        else:
            # Honour the per-side flags. getattr() keeps hand-built namespaces
            # that lack these attributes working (they behave as "not given").
            self.skips_space_after_column_separator_for_lhs = \
                not getattr(args, 'no_skip_space_after_column_separator_for_lhs', False)
            self.skips_space_after_column_separator_for_rhs = \
                not getattr(args, 'no_skip_space_after_column_separator_for_rhs', False)

        self._validate()
        self._normalize()

    def _validate(self):
        """Exit with status 1 (after logging) unless both input paths are existing files."""

        if not os.path.exists(self.lhs_file_path):
            logger.error('lhs_file_path not exists. [lhs_file_path={}]'.format(self.lhs_file_path))
            sys.exit(1)
        if not os.path.exists(self.rhs_file_path):
            logger.error('rhs_file_path not exists. [rhs_file_path={}]'.format(self.rhs_file_path))
            sys.exit(1)

        if not os.path.isfile(self.lhs_file_path):
            logger.error('lhs_file_path is not a file. [lhs_file_path={}]'.format(self.lhs_file_path))
            sys.exit(1)
        if not os.path.isfile(self.rhs_file_path):
            logger.error('rhs_file_path is not a file. [rhs_file_path={}]'.format(self.rhs_file_path))
            sys.exit(1)

    def _normalize(self):
        """Fall back to showing the count when no display option was requested."""

        if not any([self.shows_count, self.shows_difference_only, self.shows_all_lines]):
            self.shows_count = True

    def display_string_for_column_separator(self, value):
        """Return the symbolic name of a column separator.

        Args:
            value: The separator character to look up in COLUMN_SEPARATOR_s.

        Returns:
            The matching key of COLUMN_SEPARATOR_s (e.g. 'TAB'), or
            'undefined(<value>)' when the separator is not in the table.
        """
        candidates = [k for k, v in self.COLUMN_SEPARATOR_s.items() if v == value]
        if candidates:
            return candidates[0]
        return 'undefined({})'.format(value)
def show_context_for_debugging(cxt):
    """Write the context's settings to the module logger at debug level.

    One ``name=value`` record is logged per setting, followed by the
    end-of-key sentinel of MatchingKeyCodec.
    """

    def log_plain(attribute_names):
        # Settings whose log label equals the attribute name.
        for attribute_name in attribute_names:
            logger.debug('{}={}'.format(attribute_name, getattr(cxt, attribute_name)))

    log_plain((
        'lhs_file_name',
        'rhs_file_name',
        'lhs_file_path',
        'rhs_file_path',
        'matching_key_codec',
        'key_should_be_unique',
        'column_indices_to_ignore',
        'reports_in_vertical_style',
        'reports_in_horizontal_style',
        'shows_count',
        'shows_difference_only',
        'shows_all_lines',
        'shows_context_from_arguments',
        'needs_size_info_for_padding',
        'first_row_is_header',
        'sniffing_size',
    ))

    # The label of this one differs from the attribute name.
    logger.debug('force_individual_specs={}'.format(cxt.forces_individual_specs))

    # Column separators are logged by their symbolic name.
    for attribute_name in ('column_separator_for_lhs', 'column_separator_for_rhs'):
        display_name = cxt.display_string_for_column_separator(getattr(cxt, attribute_name))
        logger.debug('{}={}'.format(attribute_name, display_name))

    # Line separators are logged hex-encoded.
    for attribute_name in ('line_separator_for_lhs', 'line_separator_for_rhs'):
        logger.debug('{}={}'.format(attribute_name, getattr(cxt, attribute_name).encode('hex')))

    log_plain((
        'quote_char_for_lhs',
        'quote_char_for_rhs',
        'skips_space_after_column_separator_for_lhs',
        'skips_space_after_column_separator_for_rhs',
    ))

    logger.debug('MatchingKeyCodec#END_of_KEY={}'.format(MatchingKeyCodec.END_of_KEY))
356# ----------------------------------------------------------------------------------------------------------------------
357# Matching Key Treatment
358# ----------------------------------------------------------------------------------------------------------------------
class MatchingKeyInfo:
    """One matching-key column parsed from a spec of the form ``INDEX[:MAX_LENGTH]``.

    Attributes:
        index: Column index of the key (int).
        max_length: Width the key value is left-padded to with '0';
            0 when no length was specified.
    """

    def __init__(self, specified_string):
        """Parse ``specified_string`` (e.g. '0:8' or '3').

        A non-numeric or missing index / length is logged as an error and
        the process exits with status 1.
        """
        # Build a real list: it is consumed with pop(0) below.
        elements = [element for element in specified_string.split(':') if element != '']

        # An empty spec goes through the regular validation error instead of
        # failing with an IndexError on pop().
        index = elements.pop(0) if elements else ''
        self.index = self._transform_into_numeric(index, 'index')

        max_length = elements.pop(0) if elements else '0'
        self.max_length = self._transform_into_numeric(max_length, 'max_length')

    def __repr__(self):
        return '{}({!r}, {!r})'.format(self.__class__.__name__, self.index, self.max_length if self.max_length > 0 else '<not specified>')

    @classmethod
    def _transform_into_numeric(cls, value, name):
        """Return ``value`` as an int, or log an error and exit(1) when it is not all digits."""
        if not value.isdigit():
            logger.error('MATCHING_KEY_INDICES should be a number. See also help. [specified {}={}]'.format(name, value))
            # sys.exit rather than the `exit` helper, which only exists when the site module is loaded.
            sys.exit(1)

        return int(value)

    def key_for(self, row):
        """Return the key value of ``row``, left-padded with '0' up to ``max_length``."""
        return row[self.index].rjust(self.max_length, '0')
class MatchingKeyCodec:
    """Encodes the matching-key columns of a row into one comparable string and back.

    An encoded key is SEPARATOR followed by each key value plus SEPARATOR,
    e.g. '..0001..abc..'.
    """

    # Sentinel key value; compared against fact keys to detect the end of input.
    END_of_KEY = 'ZZZ'
    SEPARATOR = '..'

    def __init__(self, matching_key_info_list):
        self.matching_key_info_list = matching_key_info_list

    def __repr__(self):
        return '{}({!r})'.format(self.__class__.__name__, self.matching_key_info_list)

    def managed_key_for(self, row):
        """Return the encoded key for ``row``.

        When a key index is out of range for the row, an error is logged and
        the process exits with status 1.
        """
        try:
            key_values = [matching_key.key_for(row) for matching_key in self.matching_key_info_list]
        except IndexError:
            logger.error('one of the indices specified for MATCHING_KEY_INDICES is out of range [MATCHING_KEY_INDICES={}, number of columns = {}, row={}]'.format(self.matching_key_info_list, len(row), row))
            # sys.exit rather than the `exit` helper, which only exists when the site module is loaded.
            sys.exit(1)

        return self.SEPARATOR + ''.join(key_value + self.SEPARATOR for key_value in key_values)

    @property
    def matching_key_indices(self):
        """Column indices of all matching keys, as a list."""
        return [matching_key_info.index for matching_key_info in self.matching_key_info_list]

    @classmethod
    def decode_key(cls, key):
        """Split an encoded key back into its key values. Leave the padding as it is.

        Exactly one leading and one trailing SEPARATOR are removed;
        str.strip(SEPARATOR) would instead strip every leading/trailing '.'
        and damage key values that themselves start or end with a dot.
        """
        separator = cls.SEPARATOR
        if key.startswith(separator):
            key = key[len(separator):]
        if key.endswith(separator):
            key = key[:-len(separator)]
        return key.split(separator)
418# ----------------------------------------------------------------------------------------------------------------------
419# Control and Determine if it exists only on the left, only on the right, or both
420# ----------------------------------------------------------------------------------------------------------------------
def run_in(context):
    """Open both CSV files and run the comparison described by ``context``.

    The dialect of each file is fixed first (left, then right; each step may
    return an adjusted context), a pre-scan is performed, the reader is
    reset, and the differences are detected and reported.
    """
    with open(context.lhs_file_path, mode='r') as lhs_csv:
        with open(context.rhs_file_path, mode='r') as rhs_csv:

            lhs_dialect, cxt = CsvDialectFixer.fixed_dialect(context, lhs_csv, FileArrangement.LHS)
            rhs_dialect, cxt = CsvDialectFixer.fixed_dialect(cxt, rhs_csv, FileArrangement.RHS)

            reader = CsvReader(lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, cxt)
            scan_result = PreScanner.scan(cxt, reader)
            # The pre-scan has consumed rows; start over for the real pass.
            reader.reset()

            detect_diff(cxt, reader, scan_result)
def detect_diff(context, csv_reader, pre_scan_result):
    """Match both inputs by key and feed every outcome to the counter and reporters.

    Args:
        context: Settings for this run.
        csv_reader: Reader handed on to perform_key_matching().
        pre_scan_result: Supplies the number of columns (and is handed to
            the detail reporter factory).
    """
    detector = ValueDifferenceDetector(pre_scan_result.number_of_columns,
                                       context.matching_key_codec.matching_key_indices,
                                       context.column_indices_to_ignore)

    headings = HeadingReporter(context)
    details = DetailReporter.Factory.reporter_for(context, pre_scan_result)
    counts = CountReporter(context.shows_count)
    counter = counts.counter

    headings.report_heading()
    details.report_detail_heading()

    def on_lhs_only(lhs_fact):
        counter.count_for_case_of_existed_only_on_lhs(lhs_fact.lhs_row_number)
        details.report_case_of_existed_only_on_lhs(lhs_fact)

    def on_both_sides(lhs_fact, rhs_fact):
        difference = detector.detect_difference_between(lhs_fact.lhs_row, rhs_fact.rhs_row)
        counter.count_for_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)
        details.report_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)

    def on_rhs_only(rhs_fact):
        counter.count_for_case_of_existed_only_on_rhs(rhs_fact.rhs_row_number)
        details.report_case_of_existed_only_on_rhs(rhs_fact)

    perform_key_matching(csv_reader, on_lhs_only, on_both_sides, on_rhs_only)

    counts.report_count()
def perform_key_matching(csv_reader, callback_for_lhs_only, callback_for_both_sides, callback_for_rhs_only):
    """Walk both inputs in parallel, comparing keys, and dispatch each outcome.

    The side holding the smaller key is reported as existing only on that
    side and advanced; equal keys are reported together and both sides
    advance. The walk ends once both sides deliver the END_of_KEY sentinel.
    """
    end_of_key = MatchingKeyCodec.END_of_KEY

    left = csv_reader.read_lhs()
    right = csv_reader.read_rhs()

    while not (left.lhs_key == end_of_key and right.rhs_key == end_of_key):

        left_key, right_key = left.lhs_key, right.rhs_key

        if left_key == right_key:
            callback_for_both_sides(left, right)
            left = csv_reader.read_lhs()
            right = csv_reader.read_rhs()

        elif left_key < right_key:
            callback_for_lhs_only(left)
            left = csv_reader.read_lhs()

        elif left_key > right_key:
            callback_for_rhs_only(right)
            right = csv_reader.read_rhs()
492# ----------------------------------------------------------------------------------------------------------------------
493# Value-Difference Detection
494# ----------------------------------------------------------------------------------------------------------------------
class ValueDifferenceDetector:
    """Finds the columns whose values differ between two rows.

    Matching-key columns and explicitly ignored columns are excluded from
    the comparison.
    """

    class ValueDifferenceResult:
        """Outcome of one row comparison: the indices of the differing columns."""

        def __init__(self, different_column_indices):

            self.different_column_indices = different_column_indices

        @property
        def has_difference(self):
            return bool(self.different_column_indices)

    def __init__(self, number_of_columns, matching_key_indices, ignore_column_indices):
        """
        Args:
            number_of_columns: Number of columns per row; indices 0..n-1 are candidates.
            matching_key_indices: Column indices used as matching keys (not compared).
            ignore_column_indices: Column indices to leave out of the comparison.
        """
        self.column_indices = range(0, number_of_columns)
        logger.debug('column_indices={}'.format(self.column_indices))

        self.target_column_indices = set(self.column_indices) - set(matching_key_indices) - set(ignore_column_indices)
        logger.debug('target_column_indices={}'.format(self.target_column_indices))

        # Iterating the set directly would report differing columns in set
        # order, which is not guaranteed; sort once so results are always ascending.
        self._ordered_target_column_indices = sorted(self.target_column_indices)

    def detect_difference_between(self, lhs_row, rhs_row):
        """Return a ValueDifferenceResult listing, in ascending order, the target columns that differ."""
        different_column_indices = [index for index in self._ordered_target_column_indices
                                    if lhs_row[index] != rhs_row[index]]
        # Called once per matched row: let logging format the message only when debug is enabled.
        logger.debug('different_column_indices=%s', different_column_indices)
        return self.ValueDifferenceResult(different_column_indices)
524# ----------------------------------------------------------------------------------------------------------------------
525# Reporting
526# ----------------------------------------------------------------------------------------------------------------------
class PreScanner:
    """Reads ahead in both inputs to collect what the reporting pass needs."""

    class ScanResult:
        """Number of columns plus, for deep scans only, the padding sizes."""

        def __init__(self, number_of_columns, size_info_for_padding):
            self.number_of_columns = number_of_columns
            self.size_info_for_padding = size_info_for_padding

        @classmethod
        def for_lightly(cls, number_of_columns):
            """Result of a light scan: no padding information."""
            return cls(number_of_columns, None)

        @classmethod
        def for_deeply(cls, number_of_columns, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
            """Result of a deep scan: carries the sizes used for padding."""
            padding_sizes = cls.SizeInfoForPadding(lhs_max_row_number, lhs_max_row_length,
                                                   rhs_max_row_number, rhs_max_row_length)
            return cls(number_of_columns, padding_sizes)

        class SizeInfoForPadding:
            """Largest row number and longest row text seen on each side."""

            def __init__(self, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
                self.lhs_max_row_number = lhs_max_row_number
                self.lhs_max_row_length = lhs_max_row_length
                self.rhs_max_row_number = rhs_max_row_number
                self.rhs_max_row_length = rhs_max_row_length

    def __init__(self):
        pass

    @classmethod
    def scan(cls, context, csv_reader):
        """Scan deeply when the context needs padding sizes, lightly otherwise."""
        if not context.needs_size_info_for_padding:
            return PreScanner._scan_lightly(csv_reader)
        return PreScanner._scan_deeply(csv_reader)

    @classmethod
    def _scan_deeply(cls, csv_reader):
        """
        Notes
        -----
        A deep pre-scan reads both inputs to the end in order to
            * determine the number of columns for value difference detection
            * collect the size information used to format the horizontal report
        """
        started_at = timeit.default_timer()

        longest_lhs_row, longest_rhs_row = 0, 0

        lhs_fact = csv_reader.read_lhs()
        rhs_fact = csv_reader.read_rhs()

        # The column count is taken from the first rows only.
        number_of_columns = cls._determine_number_of_columns_from(lhs_fact, rhs_fact)

        end_of_key = MatchingKeyCodec.END_of_KEY

        while lhs_fact.lhs_key != end_of_key:
            longest_lhs_row = max(longest_lhs_row, len(str(lhs_fact.lhs_row)))
            lhs_fact = csv_reader.read_lhs()

        while rhs_fact.rhs_key != end_of_key:
            longest_rhs_row = max(longest_rhs_row, len(str(rhs_fact.rhs_row)))
            rhs_fact = csv_reader.read_rhs()

        # Row numbers as tracked by the reader after both sides were exhausted.
        lhs_max_row_number = csv_reader.lhs_csv_state.row_number
        rhs_max_row_number = csv_reader.rhs_csv_state.row_number
        logger.debug('lhs_max_row_number={}'.format(lhs_max_row_number))
        logger.debug('rhs_max_row_number={}'.format(rhs_max_row_number))

        elapsed = timeit.default_timer() - started_at
        logger.debug("PreScanner#scan() elapsed_time:{0}".format(elapsed) + "[sec]")
        return PreScanner.ScanResult.for_deeply(number_of_columns,
                                                lhs_max_row_number, longest_lhs_row,
                                                rhs_max_row_number, longest_rhs_row)

    @classmethod
    def _scan_lightly(cls, csv_reader):
        """
        Notes
        -----
        A light pre-scan reads one row from each side in order to
            * determine the number of columns for value difference detection

        Vertical reports do not require size information for formatting.
        """
        first_lhs_fact = csv_reader.read_lhs()
        first_rhs_fact = csv_reader.read_rhs()

        number_of_columns = cls._determine_number_of_columns_from(first_lhs_fact, first_rhs_fact)
        return PreScanner.ScanResult.for_lightly(number_of_columns)

    @classmethod
    def _determine_number_of_columns_from(cls, lhs_fact, rhs_fact):
        """Column count of the left row if it is non-empty, else of the right row, else 0."""
        if lhs_fact.lhs_row:
            return len(lhs_fact.lhs_row)
        if rhs_fact.rhs_row:
            return len(rhs_fact.rhs_row)
        return 0
class Mark(type):
    """Marker strings used by the reports; read as class attributes, e.g. ``Mark.HAS_DIFF``."""

    LHS_ONLY = '<'            # row exists only on the left-hand side
    RHS_ONLY = '>'            # row exists only on the right-hand side
    HAS_DIFF = '!'            # row exists on both sides with differing values
    NON_DIFF = ' '            # row exists on both sides without differences
    NON_DIFF_EXPRESSLY = '='
class HeadingReporter:
    """Prints the report title and, on request, the context of the run."""

    def __init__(self, context):
        self.cxt = context

    def report_heading(self):
        """Print the title, then the context if the context asks for it."""
        self._report_title()

        if not self.cxt.shows_context_from_arguments:
            return
        self._report_context()

    @classmethod
    @spacing_before(1)
    def _report_title(cls):
        print('============ Report ============')

    @spacing_before(1)
    def _report_context(self):
        """Print one line per context setting."""
        cxt = self.cxt

        def show(template, value):
            print(template.format(value))

        print('* Context')
        show('File Path on the Left-Hand Side: {}', cxt.lhs_file_path)
        show('File Path on the Right-Hand Side : {}', cxt.rhs_file_path)
        show('Matching Key Indices: {}', cxt.matching_key_codec.matching_key_info_list)
        show('Matching Key Is Unique?: {}', cxt.key_should_be_unique)
        show('Column Indices to Ignore: {}', cxt.column_indices_to_ignore)
        show('with Header?: {}', cxt.first_row_is_header)
        show('Report Style: {}', 'Vertical' if cxt.reports_in_vertical_style else 'Two facing (Horizontal)')
        show('Show Count?: {}', cxt.shows_count)
        show('Show Difference Only?: {}', cxt.shows_difference_only)
        show('Show All?: {}', cxt.shows_all_lines)
        show('Show Context?: {}', cxt.shows_context_from_arguments)
        show('CSV Sniffing Size: {}', cxt.sniffing_size)
        print('--- csv analysis conditions ---')
        show('Forces Individual Specified Conditions?: {}', cxt.forces_individual_specs)
        # DONE: display when the separator is a tab (shown by its symbolic name)
        show('column_separator_for_lhs: {}', cxt.display_string_for_column_separator(cxt.column_separator_for_lhs))
        show('column_separator_for_rhs: {}', cxt.display_string_for_column_separator(cxt.column_separator_for_rhs))
        # Line separators are shown hex-encoded.
        show('line_separator_for_lhs: {}', cxt.line_separator_for_lhs.encode('hex'))
        show('line_separator_for_rhs: {}', cxt.line_separator_for_rhs.encode('hex'))
        show('quote_char_for_lhs: {}', cxt.quote_char_for_lhs)
        show('quote_char_for_rhs: {}', cxt.quote_char_for_rhs)
        show('skips_space_after_column_separator_for_lhs: {}', cxt.skips_space_after_column_separator_for_lhs)
        show('skips_space_after_column_separator_for_rhs: {}', cxt.skips_space_after_column_separator_for_rhs)
class DetailReporter:
    """Abstract base of the reporters that print the per-row details."""

    __metaclass__ = abc.ABCMeta

    def __init__(self, context):
        self.cxt = context

    def report_detail_heading(self):
        """Print the heading of the detail section; does nothing unless details are shown."""
        if self.cxt.shows_details:
            self._report_content_heading()
            self._report_file_name()

    @spacing_before(1)
    def _report_content_heading(self):
        """Print which kind of lines the detail section lists."""
        if self.cxt.shows_difference_only:
            print('* Differences')
            return
        if self.cxt.shows_all_lines:
            print('* All')

    @abc.abstractmethod
    def _report_file_name(self):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        raise NotImplementedError()

    class Factory:
        """Chooses the concrete reporter for a context."""

        def __init__(self):
            pass

        @staticmethod
        def reporter_for(context, scan_result):
            """Return a VerticalReporter for vertical style, a HorizontalReporter otherwise."""
            reporter_class = VerticalReporter if context.reports_in_vertical_style else HorizontalReporter
            return reporter_class(context, scan_result)
744class HorizontalReporter(DetailReporter):
746 class Template:
748 DIFFERENT_COLUMN_GUIDE = 'Column indices with difference'
749 PREFIX_of_DIFF_COLUMNS = ' @ '
751 def __init__(self, lhs_max_row_number_length, lhs_max_row_length, rhs_max_row_number_length, rhs_max_row_length):
753 self.lhs_max_row_number_length = lhs_max_row_number_length
754 self.lhs_filler_length = 1
755 self.lhs_max_row_length = lhs_max_row_length
756 self.diff_mark_filler_length_in_front = 2
757 self.diff_mark_length = 1
758 self.diff_mark_filler_length_in_rear = 2
759 self.rhs_max_row_number_length = rhs_max_row_number_length
760 self.rhs_filler_length = 1
761 self.rhs_max_row_length = rhs_max_row_length
762 self.prefix_length_for_diff_columns_displays = len(self.PREFIX_of_DIFF_COLUMNS)
764 self.lhs_length = self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length
765 self.diff_mark_length = self.diff_mark_filler_length_in_front + self.diff_mark_length + self.diff_mark_filler_length_in_rear
766 self.rhs_length = self.rhs_max_row_number_length + self.rhs_filler_length + self.rhs_max_row_length
769 # --- heading-related description ---
771 def division_string(self):
772 return '-' * (self.lhs_length + self.diff_mark_length + self.rhs_length + self.prefix_length_for_diff_columns_displays + len(self.DIFFERENT_COLUMN_GUIDE))
774 def file_name_description(self, lhs_file_name, rhs_file_name):
776 lhs_file_name = lhs_file_name.ljust(self.lhs_length)
777 diff_mark_spacing = ' ' * self.diff_mark_length
778 rhs_file_name = rhs_file_name.ljust(self.rhs_length)
779 prefix_length_spacing = ' ' * self.prefix_length_for_diff_columns_displays
780 different_column_guide = self.DIFFERENT_COLUMN_GUIDE
781 return '%(lhs_file_name)s%(diff_mark_spacing)s%(rhs_file_name)s%(prefix_length_spacing)s%(different_column_guide)s' % locals()
784 # --- left-hand side related description ---
786 def lhs_only_description(self, lhs_fact):
788 lhs = self._lhs_description(lhs_fact)
789 diff_mark_area = (' ' * self.diff_mark_filler_length_in_front) + Mark.LHS_ONLY + (' ' * self.diff_mark_filler_length_in_rear)
790 return '%(lhs)s%(diff_mark_area)s' % locals()
792 def _lhs_description(self, lhs_fact):
794 lhs_row_number = str(lhs_fact.lhs_row_number).rjust(self.lhs_max_row_number_length)
795 spacing = ' ' * self.lhs_filler_length
796 lhs_row = str(lhs_fact.lhs_row).ljust(self.lhs_max_row_length)
797 return '%(lhs_row_number)s%(spacing)s%(lhs_row)s' % locals()
def _lhs_empty_description(self):
    """Return blanks as wide as the left-hand area (row number + filler + row)."""
    blank_width = sum([
        self.lhs_max_row_number_length,
        self.lhs_filler_length,
        self.lhs_max_row_length,
    ])
    return ' ' * blank_width
803 # --- right-hand side related description ---
def rhs_only_description(self, rhs_fact):
    """Return the report line for a row that exists only on the right-hand side.

    The left-hand area is filled with blanks, followed by the diff-mark area
    holding Mark.RHS_ONLY and then the right-hand row description.
    """
    blank_left = self._lhs_empty_description()
    front_filler = ' ' * self.diff_mark_filler_length_in_front
    rear_filler = ' ' * self.diff_mark_filler_length_in_rear
    mark_area = front_filler + Mark.RHS_ONLY + rear_filler
    right_part = self._rhs_description(rhs_fact)
    return '%s%s%s' % (blank_left, mark_area, right_part)
def _rhs_description(self, rhs_fact):
    """Return the right-hand area: right-justified row number, filler, padded row.

    :param rhs_fact: object providing ``rhs_row_number`` and ``rhs_row``
    """
    number_column = str(rhs_fact.rhs_row_number).rjust(self.rhs_max_row_number_length)
    filler = ' ' * self.rhs_filler_length
    row_column = str(rhs_fact.rhs_row).ljust(self.rhs_max_row_length)
    return '%s%s%s' % (number_column, filler, row_column)
820 # --- both sides related description ---
def both_description(self, lhs_fact, rhs_fact, value_difference_result):
    """Return the report line for a row that exists on both sides.

    The line consists of the left-hand area, the diff-mark area and the
    right-hand area. When ``value_difference_result.has_difference`` is true
    the mark is Mark.HAS_DIFF and the line ends with PREFIX_of_DIFF_COLUMNS
    followed by the differing column indices; otherwise the mark is
    Mark.NON_DIFF and nothing is appended.
    """
    left_part = self._lhs_description(lhs_fact)

    if value_difference_result.has_difference:
        mark = Mark.HAS_DIFF
    else:
        mark = Mark.NON_DIFF
    front_filler = ' ' * self.diff_mark_filler_length_in_front
    rear_filler = ' ' * self.diff_mark_filler_length_in_rear
    mark_area = front_filler + mark + rear_filler

    right_part = self._rhs_description(rhs_fact)

    if value_difference_result.has_difference:
        columns_prefix = self.PREFIX_of_DIFF_COLUMNS
    else:
        columns_prefix = ''
    if value_difference_result.has_difference:
        columns_text = str(value_difference_result.different_column_indices)
    else:
        columns_text = ''

    return '%s%s%s%s%s' % (left_part, mark_area, right_part, columns_prefix, columns_text)
def __init__(self, context, scan_result):
    """Set up the reporter and, when padding is needed, its line template.

    :param context: run context; kept as ``self.cxt``
    :param scan_result: provides ``size_info_for_padding``, read only when
        ``context.needs_size_info_for_padding`` is true
    """
    super(HorizontalReporter, self).__init__(context)
    self.cxt = context

    if not context.needs_size_info_for_padding:
        self.template = None
    else:
        padding = scan_result.size_info_for_padding
        # The row-number columns are as wide as the largest row number printed in decimal.
        lhs_number_width = len(str(padding.lhs_max_row_number))
        rhs_number_width = len(str(padding.rhs_max_row_number))
        self.template = HorizontalReporter.Template(lhs_number_width,
                                                    padding.lhs_max_row_length,
                                                    rhs_number_width,
                                                    padding.rhs_max_row_length)
848 # --- report heading related ---
def _report_file_name(self):
    """Print the heading: a rule, the base names of both files, and another rule."""
    tpl = self.template
    lhs_name = os.path.basename(self.cxt.lhs_file_name)
    rhs_name = os.path.basename(self.cxt.rhs_file_name)

    print(tpl.division_string())
    print(tpl.file_name_description(lhs_name, rhs_name))
    print(tpl.division_string())
857 # --- report each cases ---
def report_case_of_existed_only_on_lhs(self, lhs_fact):
    """Print the line for a left-hand-only row; does nothing unless details are shown."""
    if not self.cxt.shows_details:
        return
    print(self.template.lhs_only_description(lhs_fact))
def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
    """Print the line for a row found on both sides.

    The line is printed when all lines are shown, or when only differences
    are shown and this pair actually differs.
    """
    is_wanted_difference = self.cxt.shows_difference_only and value_difference_result.has_difference
    if not (is_wanted_difference or self.cxt.shows_all_lines):
        return
    print(self.template.both_description(lhs_fact, rhs_fact, value_difference_result))
def report_case_of_existed_only_on_rhs(self, rhs_fact):
    """Print the line for a right-hand-only row; does nothing unless details are shown."""
    if not self.cxt.shows_details:
        return
    print(self.template.rhs_only_description(rhs_fact))
class VerticalReporter(DetailReporter):
    """Detail reporter that prints left-hand and right-hand rows on separate lines."""

    class Template:
        """Builds the text lines of the vertical report."""

        LHS_MARK = 'L'
        RHS_MARK = 'R'
        PREFIX_of_DIFF_COLUMNS = '@'

        def __init__(self):
            pass

        # --- heading-related description ---

        @classmethod
        def division_string(cls):
            """Return a fixed rule of 80 '-' characters."""
            return 80 * '-'

        @classmethod
        def file_name_description(cls, mark, file_name):
            """Return ``mark`` and ``file_name`` separated by one space."""
            return ' '.join((mark, file_name))

        # --- left-hand side related description ---

        @classmethod
        def lhs_only_description(cls, lhs_fact):
            """Return the line for a row that exists only on the left-hand side."""
            pieces = (Mark.LHS_ONLY, cls.LHS_MARK, str(lhs_fact.lhs_row_number), str(lhs_fact.lhs_row))
            return ' '.join(pieces)

        # --- right-hand side related description ---

        @classmethod
        def rhs_only_description(cls, rhs_fact):
            """Return the line for a row that exists only on the right-hand side."""
            pieces = (Mark.RHS_ONLY, cls.RHS_MARK, str(rhs_fact.rhs_row_number), str(rhs_fact.rhs_row))
            return ' '.join(pieces)

        # --- both sides related description ---

        @classmethod
        def both_description_heading(cls, value_difference_result):
            """Return the heading line printed above a pair of matched rows.

            With a difference: Mark.HAS_DIFF, the diff-columns prefix and the
            differing column indices. Without one: Mark.NON_DIFF_EXPRESSLY.
            """
            if not value_difference_result.has_difference:
                return Mark.NON_DIFF_EXPRESSLY

            indices_text = str(value_difference_result.different_column_indices)
            return ' '.join((Mark.HAS_DIFF, cls.PREFIX_of_DIFF_COLUMNS, indices_text))

        @classmethod
        def both_description_lhs(cls, lhs_fact, row_number_length):
            """Return the left-hand line of a matched pair, row number right-justified."""
            number_text = str(lhs_fact.lhs_row_number).rjust(row_number_length)
            return ' ' + ' '.join((cls.LHS_MARK, number_text, str(lhs_fact.lhs_row)))

        @classmethod
        def both_description_rhs(cls, rhs_fact, row_number_length):
            """Return the right-hand line of a matched pair, row number right-justified."""
            number_text = str(rhs_fact.rhs_row_number).rjust(row_number_length)
            return ' ' + ' '.join((cls.RHS_MARK, number_text, str(rhs_fact.rhs_row)))

    def __init__(self, context, _):
        """Keep the context and create the template; the second argument is ignored."""
        super(VerticalReporter, self).__init__(context)
        self.cxt = context
        self.template = VerticalReporter.Template()

    # --- report heading related ---

    def _report_file_name(self):
        """Print the heading: a rule, one line per file (base name only), and a rule."""
        tpl = self.template
        lhs_name = os.path.basename(self.cxt.lhs_file_name)
        rhs_name = os.path.basename(self.cxt.rhs_file_name)

        print(tpl.division_string())
        print(tpl.file_name_description(tpl.LHS_MARK, lhs_name))
        print(tpl.file_name_description(tpl.RHS_MARK, rhs_name))
        print(tpl.division_string())

    # --- report each cases ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        """Print the line for a left-hand-only row; does nothing unless details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        """Print heading, left-hand and right-hand lines for a row found on both sides.

        Printed when all lines are shown, or when only differences are shown
        and this pair actually differs.
        """
        is_wanted_difference = self.cxt.shows_difference_only and value_difference_result.has_difference
        if not (is_wanted_difference or self.cxt.shows_all_lines):
            return

        # Pad both row numbers to the wider of the two so the rows line up.
        lhs_digits = len(str(lhs_fact.lhs_row_number))
        rhs_digits = len(str(rhs_fact.rhs_row_number))
        number_width = max(lhs_digits, rhs_digits)

        print(self.template.both_description_heading(value_difference_result))
        print(self.template.both_description_lhs(lhs_fact, number_width))
        print(self.template.both_description_rhs(rhs_fact, number_width))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        """Print the line for a right-hand-only row; does nothing unless details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.rhs_only_description(rhs_fact))
class CountReporter:
    """Collects the comparison tallies and prints them as a summary."""

    class Counter:
        """Tallies of the comparison outcomes plus the row numbers behind them."""

        def __init__(self):
            # Row numbers recorded for each kind of finding.
            self.row_numbers_for_lhs_only = []
            self.row_numbers_for_rhs_only = []
            self.row_numbers_for_differences = {}  # lhs row number -> rhs row number

            # Plain tallies.
            self.number_of_same_lines = 0
            self.number_of_lhs_only = 0
            self.number_of_rhs_only = 0
            self.number_of_differences = 0

            # Cache for max_digit; filled on first access.
            self._max_digit = None

        def _increment_same_lines(self):
            self.number_of_same_lines = self.number_of_same_lines + 1

        def _increment_lhs_only(self):
            self.number_of_lhs_only = self.number_of_lhs_only + 1

        def _increment_rhs_only(self):
            self.number_of_rhs_only = self.number_of_rhs_only + 1

        def _increment_differences(self):
            self.number_of_differences = self.number_of_differences + 1

        def _add_row_number_for_lhs_only(self, row_number):
            self.row_numbers_for_lhs_only.append(row_number)

        def _add_row_number_for_rhs_only(self, row_number):
            self.row_numbers_for_rhs_only.append(row_number)

        def _add_row_number_for_differences(self, lhs_row_number, rhs_row_number):
            self.row_numbers_for_differences[lhs_row_number] = rhs_row_number

        def count_for_case_of_existed_only_on_lhs(self, row_number):
            """Record one row that exists only on the left-hand side."""
            self._increment_lhs_only()
            self._add_row_number_for_lhs_only(row_number)

        def count_for_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
            """Record a matched pair as either identical or different."""
            if not value_difference_result.has_difference:
                self._increment_same_lines()
                return

            self._increment_differences()
            self._add_row_number_for_differences(lhs_fact.lhs_row_number, rhs_fact.rhs_row_number)

        def count_for_case_of_existed_only_on_rhs(self, row_number):
            """Record one row that exists only on the right-hand side."""
            self._increment_rhs_only()
            self._add_row_number_for_rhs_only(row_number)

        @property
        def sorted_row_numbers_for_differences(self):
            """(lhs row number, rhs row number) pairs ordered by the lhs row number."""
            return sorted(self.row_numbers_for_differences.items(), key=lambda pair: pair[0])

        @property
        def max_digit(self):
            """Number of digits of the widest tally; computed once, then cached."""
            if self._max_digit is None:
                tallies = (
                    self.number_of_same_lines,
                    self.number_of_lhs_only,
                    self.number_of_rhs_only,
                    self.number_of_differences,
                )
                self._max_digit = max(len(str(tally)) for tally in tallies)
            return self._max_digit

    def __init__(self, shows_count):
        self.shows_count = shows_count
        self.counter = self.Counter()

    def _func_of_right_justified_number(self):
        """Return a function that right-justifies a number to the widest tally's width."""
        def right_justified(number):
            return str(number).rjust(self.counter.max_digit)

        return right_justified

    @spacing_before(1)
    def report_count(self):
        """Print the tallies and the recorded row numbers when counting is enabled."""
        if self.shows_count:
            tally = self.counter
            rjust = self._func_of_right_justified_number()

            print('* Count & Row number')
            print('same lines : {}'.format(rjust(tally.number_of_same_lines)))
            print('left side only ({}): {} :-- Row Numbers -->: {}'.format(Mark.LHS_ONLY, rjust(tally.number_of_lhs_only), tally.row_numbers_for_lhs_only))
            print('right side only ({}): {} :-- Row Numbers -->: {}'.format(Mark.RHS_ONLY, rjust(tally.number_of_rhs_only), tally.row_numbers_for_rhs_only))
            print('with differences ({}): {} :-- Row Number Pairs -->: {}'.format(Mark.HAS_DIFF, rjust(tally.number_of_differences), tally.sorted_row_numbers_for_differences))
1070# ----------------------------------------------------------------------------------------------------------------------
1071# CSV Reading
1072# ----------------------------------------------------------------------------------------------------------------------
class FileArrangement(type):
    """Suffix constants for the two sides of a comparison.

    NOTE(review): these look like the ``file_arrangement`` values that
    CsvDialectFixer appends to context attribute names such as
    ``column_separator`` -- confirm against the callers.
    """

    RHS = '_for_rhs'
    LHS = '_for_lhs'
class CsvDialectFixer:
    """Decides which csv dialect to read a file with.

    The dialect is either built from the per-file settings held by the
    context, or sniffed from the beginning of the file. When sniffing
    succeeds, the sniffed values are written back to the context.
    """

    def __init__(self):
        pass

    @classmethod
    def fixed_dialect(cls, context, csv_file, file_arrangement):
        """Return ``(dialect, context)`` for ``csv_file``.

        :param context: run context; ``forces_individual_specs`` selects the
            explicit settings instead of sniffing
        :param csv_file: seekable file object; only read when sniffing
        :param file_arrangement: suffix appended to the context attribute
            names (e.g. ``column_separator`` + suffix) to pick one side's settings
        """
        if context.forces_individual_specs:
            return cls._dialect_from_context(context, file_arrangement)
        return cls._try_sniffing(context, csv_file, file_arrangement)

    @classmethod
    def _dialect_from_context(cls, context, file_arrangement):
        """Build an excel-based dialect from the settings stored in the context."""
        dialect = csv.excel()
        dialect.delimiter = getattr(context, "column_separator" + file_arrangement)
        dialect.lineterminator = getattr(context, "line_separator" + file_arrangement)
        dialect.quotechar = getattr(context, "quote_char" + file_arrangement)
        dialect.skipinitialspace = getattr(context, "skips_space_after_column_separator" + file_arrangement)

        return dialect, context

    @classmethod
    def _try_sniffing(cls, context, csv_file, file_arrangement):
        """Sniff the dialect; on csv.Error fall back to the context's settings.

        The file is always rewound to its start afterwards, because sniffing
        consumes the sample from the file.
        """
        try:
            return cls._sniff(context, csv_file, file_arrangement)

        except csv.Error as e:
            # str(e) rather than e.message: BaseException.message has been
            # deprecated since Python 2.6 and does not exist on Python 3.
            logger.warning('Sniffing failed. Generated a dialect from context instead. [type={}, args={}, message={}]'.format(type(e), str(e.args), str(e)))
            return cls._dialect_from_context(context, file_arrangement)

        finally:
            csv_file.seek(0)

    @classmethod
    def _sniff(cls, context, csv_file, file_arrangement):
        """Sniff dialect and header presence from the first ``context.sniffing_size`` characters.

        :raises csv.Error: when the sniffer cannot determine the dialect
        """
        sample = csv_file.read(context.sniffing_size)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)

        adjusted_context = cls._adjust_context_with(dialect, has_header, context, file_arrangement)

        return dialect, adjusted_context

    @classmethod
    def _adjust_context_with(cls, dialect, has_header, context, file_arrangement):
        """Write the sniffed values back to the context and return it.

        An explicit ``context.header`` setting wins over the sniffed header
        detection: 'y' means the first row is a header, any other non-None
        value means it is not.
        """
        setattr(context, "column_separator" + file_arrangement, dialect.delimiter)
        setattr(context, "line_separator" + file_arrangement, dialect.lineterminator)
        setattr(context, "quote_char" + file_arrangement, dialect.quotechar)
        setattr(context, "skips_space_after_column_separator" + file_arrangement, dialect.skipinitialspace)
        context.first_row_is_header = has_header if context.header is None else context.header == 'y'

        return context
def debug_log_dialect(dialect, context, message):
    """Write every attribute of a csv dialect to the debug log.

    The delimiter and the line terminator are logged hex-encoded.

    :param dialect: the csv dialect to describe
    :param context: not used by this function
    :param message: label placed in the first log line
    """
    debug = logger.debug

    debug('---{}---'.format(message))
    debug('sniffing dialect={}'.format(dialect))
    debug('sniffing dialect csv.excel={}'.format(isinstance(dialect, csv.excel)))
    debug('sniffing dialect csv.excel_tab={}'.format(isinstance(dialect, csv.excel_tab)))
    debug('sniffing dialect.delimiter={}'.format(dialect.delimiter.encode('hex')))
    debug('sniffing dialect.doublequote={}'.format(dialect.doublequote))
    debug('sniffing dialect.escapechar={}'.format(dialect.escapechar))
    debug('sniffing dialect.lineterminator={}'.format(dialect.lineterminator.encode('hex')))
    debug('sniffing dialect.quotechar={}'.format(dialect.quotechar))
    debug('sniffing dialect.quoting={}'.format(dialect.quoting))
    debug('sniffing dialect.skipinitialspace={}'.format(dialect.skipinitialspace))
class LhsFact:
    """One row read from the left-hand CSV, with its row number and matching key."""

    def __init__(self, lhs_row_number, lhs_row, lhs_key):
        creation_note = 'LhsFact 生成 lhs_row_number={}, lhs_row={}, lhs_key={}'.format(lhs_row_number, lhs_row, lhs_key)
        logger.debug(creation_note)

        self.lhs_row_number, self.lhs_row, self.lhs_key = lhs_row_number, lhs_row, lhs_key
class RhsFact:
    """One row read from the right-hand CSV, with its row number and matching key."""

    def __init__(self, rhs_row_number, rhs_row, rhs_key):
        creation_note = 'RhsFact 生成 rhs_row_number={}, rhs_row={}, rhs_key={}'.format(rhs_row_number, rhs_row, rhs_key)
        logger.debug(creation_note)

        self.rhs_row_number, self.rhs_row, self.rhs_key = rhs_row_number, rhs_row, rhs_key
class CsvReader:
    """Reads the left-hand and right-hand CSV files row by row.

    Every read returns a fact object (LhsFact / RhsFact) holding the row, its
    row number and its matching key. While reading, the keys of each file are
    checked to be in ascending order (and unique, when the context requires
    it); a violation is logged and terminates the program with exit status 1.
    """

    class State:
        """Reading position of one CSV file: reader, row number and previous key."""

        def __init__(self, csv_file, dialect, file_name, first_row_is_header):

            self._csv_file = csv_file
            self._dialect = dialect
            self._file_name = file_name
            self._first_row_is_header = first_row_is_header

            self._csv_reader = csv.reader(csv_file, dialect)
            self._row_number = 0
            self._previous_key = ""

        def reset(self):
            """Rewind the file and start over with a fresh reader and cleared counters."""
            self._csv_file.seek(0)
            self._csv_reader = csv.reader(self._csv_file, self._dialect)
            self._row_number = 0
            self._previous_key = ""

        def increment_row_number(self):
            """Advance the row number, unless the end of the file has been recorded."""
            if self._previous_key == MatchingKeyCodec.END_of_KEY:
                return

            self._row_number += 1

        def key_changed(self, new_key):
            """Remember ``new_key`` as the previous key; ignored while on the header row."""
            if self._is_header():
                return

            self._previous_key = new_key

        def _is_header(self):
            # The row number is still 0 while the first row is being read.
            return self.row_number == 0 and self._first_row_is_header

        @property
        def csv_reader(self):
            return self._csv_reader

        @property
        def file_name(self):
            return self._file_name

        @property
        def row_number(self):
            return self._row_number

        @property
        def previous_key(self):
            return self._previous_key

    def __init__(self, lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, context):
        """Create the reading state for both files and skip their header rows.

        :param lhs_csv: seekable file object of the left-hand CSV
        :param rhs_csv: seekable file object of the right-hand CSV
        :param lhs_dialect: csv dialect used for the left-hand file
        :param rhs_dialect: csv dialect used for the right-hand file
        :param context: run context (file names, header flag, key settings)
        """
        debug_log_dialect(lhs_dialect, context, '左CSV')
        debug_log_dialect(rhs_dialect, context, '右CSV')

        self.lhs_csv_state = CsvReader.State(lhs_csv, lhs_dialect, context.lhs_file_name, context.first_row_is_header)
        self.rhs_csv_state = CsvReader.State(rhs_csv, rhs_dialect, context.rhs_file_name, context.first_row_is_header)
        self.cxt = context

        self.skip_header()

    def skip_header(self):
        """Consume the first row of both files when it is a header."""
        if self.cxt.first_row_is_header:
            _ = self.read_lhs()
            _ = self.read_rhs()

    def reset(self):
        """Rewind both files and skip their headers again."""
        self.lhs_csv_state.reset()
        self.rhs_csv_state.reset()
        self.skip_header()

    def read_lhs(self):
        """Read the next left-hand row and return it as an LhsFact."""
        lhs_row, lhs_key = self._read_csv(self.lhs_csv_state)
        self.lhs_csv_state.increment_row_number()
        return LhsFact(self.lhs_csv_state.row_number, lhs_row, lhs_key)

    def read_rhs(self):
        """Read the next right-hand row and return it as an RhsFact."""
        rhs_row, rhs_key = self._read_csv(self.rhs_csv_state)
        self.rhs_csv_state.increment_row_number()
        return RhsFact(self.rhs_csv_state.row_number, rhs_row, rhs_key)

    def _read_csv(self, csv_state):
        """Return ``(row, key)`` for the next row of ``csv_state``.

        At the end of the file an empty row and MatchingKeyCodec.END_of_KEY
        are returned instead of raising StopIteration.
        """
        try:
            row = next(csv_state.csv_reader)
        except StopIteration:
            csv_state.key_changed(MatchingKeyCodec.END_of_KEY)
            return [], MatchingKeyCodec.END_of_KEY

        new_key = self.cxt.matching_key_codec.managed_key_for(row)
        self._detect_key_violation(new_key, csv_state)

        csv_state.key_changed(new_key)

        return row, new_key

    def _detect_key_violation(self, new_key, csv_state):
        """Terminate with exit status 1 when the keys are unsorted or duplicated.

        Nothing is checked while no previous key has been recorded.

        :raises SystemExit: on a sort-order or uniqueness violation
        """
        if csv_state.previous_key == '':
            return

        if new_key < csv_state.previous_key:
            logger.error('matching keys in {} are not sorted.'
                         ' [current_key={}, previous_key={}, matching-key-indices={}] If the key is a number without zero padding, specify the max size of the key after colon like -k0:8.'.format(
                             csv_state.file_name, MatchingKeyCodec.decode_key(new_key), MatchingKeyCodec.decode_key(csv_state.previous_key), self.cxt.matching_key_codec.matching_key_info_list))
            # sys.exit instead of the builtin exit(): the latter is injected by
            # the site module and is not guaranteed to exist (e.g. python -S).
            sys.exit(1)

        if self.cxt.key_should_be_unique and new_key == csv_state.previous_key:
            logger.error('matching keys in {} are not unique.'
                         ' [current_key={}, previous_key={}, matching-key-indices={}]'.format(
                             csv_state.file_name, MatchingKeyCodec.decode_key(new_key), MatchingKeyCodec.decode_key(csv_state.previous_key), self.cxt.matching_key_codec.matching_key_info_list))
            sys.exit(1)
# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
    main()