Coverage for src/csvdiff3/csvdiff.py : 96%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3import abc
4import binascii
5import csv
6import functools
7import logging
8import os
9import sys
10import time
11import traceback
12import unicodedata
13from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
14from logging import Logger
17# ----------------------------------------------------------------------------------------------------------------------
18# Decorators
19# ----------------------------------------------------------------------------------------------------------------------
def show_execution_time():
    """Build a decorator that prints the wall-clock time of each call.

    The decorated callable is invoked unchanged; after it returns, a blank
    line, ``elapsed_time=<seconds>[sec]`` and another blank line are printed.

    Returns:
        A decorator. The wrapper it produces returns whatever the wrapped
        callable returns and keeps its name/docstring.
    """

    def _execution_time(func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.perf_counter()
            # Keep the result so decorating a function does not silently
            # turn its return value into None.
            result = func(*args, **kwargs)
            elapsed_time = time.perf_counter() - start

            print()
            print(f'elapsed_time={elapsed_time}[sec]')
            print()
            return result

        return wrapper

    return _execution_time
def spacing_before(number_of_lines):
    """Build a decorator that prints blank lines before each call.

    Args:
        number_of_lines: How many empty lines to print. A falsy value
            (``0`` or ``None``) is treated as ``1``.

    Returns:
        A decorator. The wrapper it produces returns whatever the wrapped
        callable returns and keeps its name/docstring.
    """
    number_of_lines = number_of_lines or 1

    def _spacing_before(func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for _ in range(number_of_lines):
                print('')
            # Propagate the result instead of discarding it.
            return func(*args, **kwargs)

        return wrapper

    return _spacing_before
59# ----------------------------------------------------------------------------------------------------------------------
60# Entrance
61# ----------------------------------------------------------------------------------------------------------------------
63# @show_execution_time()
def main():
    """Command-line entry point.

    Configures logging, builds the context from the command-line arguments,
    logs it for debugging and runs the comparison. An ``IndexError`` raised
    by the run is logged and the process exits with status 1.
    """
    configure()

    context = context_from_arguments()
    show_context_for_debugging(context)

    try:
        run_in(context)
    except IndexError as error:
        message = (
            'It is possible that the number of columns in the row is not aligned. '
            'Please check the csv data. If not, please file an issue. '
            f'[{type(error)}, description={error}]'
        )
        logger.error(message)
        sys.exit(1)
class App(type):
    """Holder for the program name and version string."""

    VERSION = '1.0.0'
    NAME = 'csv-diff-python3@blue-monk'
class LoggingConfig(type):
    """Logging levels, formats and log-file path used by ``configure``."""

    # For debug, play with the CONSOLE_LEVEL or FILE_LEVEL.

    # Level handed to logging.basicConfig().
    BASE_LEVEL = logging.DEBUG

    # Console (stream) handler settings.
    CONSOLE_FORMAT = '%(levelname)s: %(message)s'
    CONSOLE_LEVEL = logging.ERROR

    # File handler settings.
    FILE_PATH = 'csvdiff.log'
    FILE_FORMAT = '%(asctime)s: %(levelname)s: %(message)s'
    FILE_LEVEL = logging.WARNING
logger: Logger = logging.getLogger(__name__)


def configure():
    """Set up logging for this module.

    Calls ``logging.basicConfig`` with the base level, then attaches a
    console handler and a file handler (opened in ``'w'`` mode) to the
    module logger, each with its own level and format taken from
    ``LoggingConfig``. Propagation to the root logger is switched off.
    """
    logging.basicConfig(level=LoggingConfig.BASE_LEVEL)

    console_handler = logging.StreamHandler()
    logfile_handler = logging.FileHandler(filename=LoggingConfig.FILE_PATH, mode='w')

    handler_settings = (
        (console_handler, LoggingConfig.CONSOLE_LEVEL, LoggingConfig.CONSOLE_FORMAT),
        (logfile_handler, LoggingConfig.FILE_LEVEL, LoggingConfig.FILE_FORMAT),
    )
    for handler, level, message_format in handler_settings:
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter(message_format))

    # Console first, then file -- same attachment order as before.
    for handler, _, _ in handler_settings:
        logger.addHandler(handler)

    logger.propagate = False
119# ----------------------------------------------------------------------------------------------------------------------
120# Context Preparation
121# ----------------------------------------------------------------------------------------------------------------------
def context_from_arguments():
    """Parse the command line and wrap the result in a ``Context``.

    Returns:
        Context: constructed from ``parser.parse_args()``.
    """

    def arg_type_matching_key_in_csv(x):
        # One MatchingKeyInfo per comma-separated item, e.g. '0:8,3'.
        return [MatchingKeyInfo(spec) for spec in x.split(',')]

    def arg_type_int_in_csv(x):
        # e.g. '3,7' -> [3, 7]
        return [int(index) for index in x.split(',')]

    parser = ArgumentParser(prog=App.NAME, formatter_class=ArgumentDefaultsHelpFormatter)

    # Program name & Version -------------------------------------------------------------------------------------------
    parser.add_argument('--version', action='version', version=f'%(prog)s {App.VERSION}')

    # Input CSV file paths ---------------------------------------------------------------------------------------------
    parser.add_argument('lhs_file_name', type=str, help='Absolute/Relative path to left-hand side file.')
    parser.add_argument('rhs_file_name', type=str, help='Absolute/Relative path to right-hand side file.')

    # Input CSV file encodings -----------------------------------------------------------------------------------------
    parser.add_argument('-e', '--encoding', type=str, default=None,
                        help='Encoding of the CSV files. (refer public reference named "Standard encoding") e.g.: shift_jis')

    parser.add_argument('--encoding-for-lhs', type=str, default='utf8',
                        help='Encoding of the CSV file on the left side. (refer public reference named "Standard encoding") e.g.: shift_jis')
    parser.add_argument('--encoding-for-rhs', type=str, default='utf8',
                        help='Encoding of the CSV file on the right side. (refer public reference named "Standard encoding") e.g.: shift_jis')

    # Matching conditions ----------------------------------------------------------------------------------------------
    # The string default '0' is passed through the type function by argparse.
    parser.add_argument('-k', '--matching-keys', type=arg_type_matching_key_in_csv, default='0',
                        help='Matching key indices(from 0) for Input CSV in CSV format. For non-fixed length numbers, specify the number of digits after ":". e.g.: 0:8,3')
    parser.add_argument('-u', '--unique-key', default=False, action='store_true',
                        help="Specify if the matching key is unique. Then, if it detects that the matching key is not unique, an error will occur.")
    parser.add_argument('-i', '--ignore-columns', type=arg_type_int_in_csv, default=[],
                        help='Specify the index of the column to be ignored in CSV format. e.g.: 3,7')

    # Report styles ----------------------------------------------------------------------------------------------------
    parser.add_argument('-v', '--vertical-style', default=False, action='store_true',
                        help='Report in vertical style. If not specified, report in horizontal(two facing) style.')

    parser.add_argument('-c', '--show-count', default=False, action='store_true',
                        help='Report the number of differences. Treat this as True if neither -d nor -a is specified.')

    display_group = parser.add_mutually_exclusive_group()
    display_group.add_argument('-d', '--show-difference-only', default=False, action='store_true',
                               help='Report the lines with the difference. Can be used with option -c. Cannot be used with option -a.')
    display_group.add_argument('-a', '--show-all-lines', action='store_true',
                               help='Report on all lines. Can be used with option -c. Cannot be used with option -d.')

    parser.add_argument('-x', '--show-context-from-arguments', default=False, action='store_true',
                        help='Report the context generated from the arguments and CSV sniffing.')

    # CSV analysis conditions ------------------------------------------------------------------------------------------
    parser.add_argument('-H', '--header', type=str, default=None, choices=['n', 'y'],
                        help='If specified, this specification will be enforced.')

    # type=int so a user-supplied size has the same type as the int default
    # (it was previously parsed as a string).
    parser.add_argument('-S', '--sniffing-size', type=int, default=4096,
                        help="If csv sniffing fails, try specifying a size larger than 4096. Or Explicitly specify CSV file conditions like '--column-separator-for-lhs TAB'. Check help with -h option.")

    parser.add_argument('-F', '--force-individual-specs', action='store_true',
                        help="If you don't want to rely on csv sniffing, specify it, and then specify --column-separator and so on separately.")

    parser.add_argument('--column-separator', type=str, default=None, choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process both sides CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator', type=str, default=None, choices=['LF', 'CRLF'],
                        help='Process both sides CSV file using the specified line separator.')

    parser.add_argument('--quote-char', type=str, default=None, choices=['"', "'"],
                        help='Process both sides CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator', action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the both sides CSV file.')

    # CSV analysis conditions by left and right ------------------------------------------------------------------------
    parser.add_argument('--column-separator-for-lhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process left-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--column-separator-for-rhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process right-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator-for-lhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process left-hand side CSV file using the specified line separator.')

    parser.add_argument('--line-separator-for-rhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process right-hand side CSV file using the specified line separator.')

    parser.add_argument('--quote-char-for-lhs', type=str, default='"', choices=['"', "'"],
                        help='Process left-hand side CSV file using the specified quote character.')

    parser.add_argument('--quote-char-for-rhs', type=str, default='"', choices=['"', "'"],
                        help='Process right-hand side CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator-for-lhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the left side.')

    parser.add_argument('--no-skip-space-after-column-separator-for-rhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the right side.')

    # ------------------------------------------------------------------------------------------------------------------

    return Context(parser.parse_args())
class Context:
    """Run settings derived from the parsed command-line arguments.

    The constructor copies/derives every setting from ``args``, then checks
    that both input paths exist and are files (exiting with status 1
    otherwise) and finally turns on count reporting when no report option
    was selected.
    """

    # Option keyword -> actual line terminator.
    LINE_SEPARATOR_s = {
        "CR": '\r',
        "LF": '\n',
        "CRLF": '\r\n',
        None: '<None>',
    }

    # Option keyword -> actual column delimiter.
    COLUMN_SEPARATOR_s = {
        "COMMA": ',',
        "TAB": '\t',
        "SEMICOLON": ';',
        None: '<None>',
    }

    def __init__(self, args):
        """Populate the context from an ``argparse`` namespace.

        Args:
            args: Namespace produced by the argument parser of this program.
        """

        # Input CSV file paths -----------------------------------------------------------------------------------------
        self.lhs_file_name = args.lhs_file_name
        self.rhs_file_name = args.rhs_file_name
        self.lhs_file_path = os.path.abspath(args.lhs_file_name)
        self.rhs_file_path = os.path.abspath(args.rhs_file_name)

        # Input CSV file encodings -------------------------------------------------------------------------------------
        # A common --encoding wins over the per-side options.
        if args.encoding:
            self.encoding_for_lhs = args.encoding
            self.encoding_for_rhs = args.encoding
        else:
            self.encoding_for_lhs = args.encoding_for_lhs
            self.encoding_for_rhs = args.encoding_for_rhs

        # Matching conditions ------------------------------------------------------------------------------------------
        self.matching_key_codec = MatchingKeyCodec(args.matching_keys)
        self.key_should_be_unique = args.unique_key
        self.column_indices_to_ignore = args.ignore_columns

        # Report styles ------------------------------------------------------------------------------------------------
        self.reports_in_vertical_style = args.vertical_style
        self.reports_in_horizontal_style = not args.vertical_style

        self.shows_count = args.show_count
        self.shows_difference_only = args.show_difference_only
        self.shows_all_lines = args.show_all_lines
        self.shows_details = bool(self.shows_difference_only or self.shows_all_lines)
        self.shows_context_from_arguments = args.show_context_from_arguments

        self.needs_size_info_for_padding = self.shows_details and self.reports_in_horizontal_style

        # CSV analysis conditions --------------------------------------------------------------------------------------
        self.header = args.header
        self.first_row_is_header = None

        self.sniffing_size = args.sniffing_size

        self.forces_individual_specs = args.force_individual_specs

        # For each setting below: with -F and a both-sides option given, that
        # option applies to both files; otherwise the per-side options apply.
        if self.forces_individual_specs and args.column_separator:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator]
        else:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_lhs]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_rhs]

        if self.forces_individual_specs and args.line_separator:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator]
        else:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator_for_lhs]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator_for_rhs]

        if self.forces_individual_specs and args.quote_char:
            self.quote_char_for_lhs = args.quote_char
            self.quote_char_for_rhs = args.quote_char
        else:
            self.quote_char_for_lhs = args.quote_char_for_lhs
            self.quote_char_for_rhs = args.quote_char_for_rhs

        if self.forces_individual_specs and args.no_skip_space_after_column_separator:
            self.skips_space_after_column_separator_for_lhs = False
            self.skips_space_after_column_separator_for_rhs = False
        else:
            # Honour the per-side flags; they were previously parsed but
            # ignored (both sides were always set to True here).
            self.skips_space_after_column_separator_for_lhs = not args.no_skip_space_after_column_separator_for_lhs
            self.skips_space_after_column_separator_for_rhs = not args.no_skip_space_after_column_separator_for_rhs

        self._validate()
        self._normalize()

    def _validate(self):
        """Exit with status 1 unless both input paths exist and are files."""

        if not os.path.exists(self.lhs_file_path):
            logger.error(f'lhs_file_path not exists. [lhs_file_path={self.lhs_file_path}]')
            sys.exit(1)
        if not os.path.exists(self.rhs_file_path):
            logger.error(f'rhs_file_path not exists. [rhs_file_path={self.rhs_file_path}]')
            sys.exit(1)

        if not os.path.isfile(self.lhs_file_path):
            logger.error(f'lhs_file_path is not a file. [lhs_file_path={self.lhs_file_path}]')
            sys.exit(1)
        if not os.path.isfile(self.rhs_file_path):
            logger.error(f'rhs_file_path is not a file. [rhs_file_path={self.rhs_file_path}]')
            sys.exit(1)

    def _normalize(self):
        """Default to count reporting when no report option was selected."""

        if not any([self.shows_count, self.shows_difference_only, self.shows_all_lines]):
            self.shows_count = True

    def display_string_for_column_separator(self, value):
        """Return the option keyword for a delimiter character.

        Args:
            value: Delimiter character, e.g. ``','``.

        Returns:
            The first key of ``COLUMN_SEPARATOR_s`` mapped to ``value``, or
            ``'undefined(<value>)'`` when there is none.
        """

        candidates = [k for k, v in self.COLUMN_SEPARATOR_s.items() if v == value]
        if candidates:
            return candidates[0]
        # The fallback string used to be built but never returned.
        return f'undefined({value})'

    def display_string_for_line_separator(self, value, file_arrangement):
        """Return ``value`` as a hex string of its encoded bytes.

        Args:
            value: Line separator string.
            file_arrangement: Suffix appended to ``'encoding'`` to pick the
                encoding attribute of this context (e.g. a value that yields
                ``encoding_for_lhs``).
        """

        encoding_value = getattr(self, "encoding" + file_arrangement)
        return binascii.hexlify(value.encode(encoding_value)).decode()
def show_context_for_debugging(cxt):
    """Write every context setting to the module logger at DEBUG level.

    Each setting is logged as one ``name=value`` message, in a fixed order.
    Separators are logged through the context's display helpers rather than
    as raw characters.
    """

    def log_settings(*attribute_names):
        # Attributes are read one at a time, right before being logged.
        for attribute_name in attribute_names:
            logger.debug(f'{attribute_name}={getattr(cxt, attribute_name)}')

    log_settings(
        'lhs_file_name',
        'rhs_file_name',
        'lhs_file_path',
        'rhs_file_path',
        'encoding_for_lhs',
        'encoding_for_rhs',
        'matching_key_codec',
        'key_should_be_unique',
        'column_indices_to_ignore',
        'reports_in_vertical_style',
        'reports_in_horizontal_style',
        'shows_count',
        'shows_difference_only',
        'shows_all_lines',
        'shows_context_from_arguments',
        'needs_size_info_for_padding',
        'first_row_is_header',
        'sniffing_size',
    )

    # The logged label differs from the attribute name here.
    logger.debug(f'force_individual_specs={cxt.forces_individual_specs}')

    lhs_column_separator = cxt.display_string_for_column_separator(cxt.column_separator_for_lhs)
    logger.debug(f'column_separator_for_lhs={lhs_column_separator}')
    rhs_column_separator = cxt.display_string_for_column_separator(cxt.column_separator_for_rhs)
    logger.debug(f'column_separator_for_rhs={rhs_column_separator}')

    lhs_line_separator = cxt.display_string_for_line_separator(cxt.line_separator_for_lhs, FileArrangement.LHS)
    logger.debug(f'line_separator_for_lhs={lhs_line_separator}')
    rhs_line_separator = cxt.display_string_for_line_separator(cxt.line_separator_for_rhs, FileArrangement.RHS)
    logger.debug(f'line_separator_for_rhs={rhs_line_separator}')

    log_settings(
        'quote_char_for_lhs',
        'quote_char_for_rhs',
        'skips_space_after_column_separator_for_lhs',
        'skips_space_after_column_separator_for_rhs',
    )

    logger.debug(f'MatchingKeyCodec#END_of_KEY={MatchingKeyCodec.END_of_KEY}')
395# ----------------------------------------------------------------------------------------------------------------------
396# Matching Key Treatment
397# ----------------------------------------------------------------------------------------------------------------------
class MatchingKeyInfo:
    """One matching-key column: its index and an optional padded width.

    Parsed from a spec of the form ``'<index>'`` or ``'<index>:<max_length>'``.
    When ``max_length`` is omitted it is stored as ``0``.
    """

    def __init__(self, specified_string):
        """Parse ``specified_string``; log an error and exit(1) if it is not numeric.

        Args:
            specified_string: e.g. ``'0'`` or ``'0:8'``. Empty parts produced
                by stray colons are ignored.
        """

        elements = [element for element in specified_string.split(':') if element != '']

        # An empty spec is reported like any other non-numeric index instead
        # of failing with IndexError on pop().
        index = elements.pop(0) if elements else ''
        self.index = self._transform_into_numeric(index, 'index')

        max_length = elements.pop(0) if elements else '0'
        self.max_length = self._transform_into_numeric(max_length, 'max_length')

    def __repr__(self):
        return f"{self.__class__.__name__}({self.index!r}, {(self.max_length if self.max_length > 0 else '<not specified>')!r})"

    @classmethod
    def _transform_into_numeric(cls, value, name):
        """Return ``int(value)``, or log an error and exit(1) if not all decimal digits."""

        # isdecimal() (unlike isdigit()) only accepts characters int() can
        # convert, so e.g. a superscript digit is rejected here cleanly.
        if not value.isdecimal():
            logger.error(f'MATCHING_KEY_INDICES should be a number. See also help. [specified {name}={value}]')
            # sys.exit, as elsewhere in this file; the bare exit() helper is
            # installed by the site module and is not always available.
            sys.exit(1)

        return int(value)

    def key_for(self, row):
        """Return the key column of ``row``, left-padded with '0' to ``max_length``."""
        return row[self.index].rjust(self.max_length, '0')
class MatchingKeyCodec:
    """Builds and splits the composite key string used to match rows.

    A managed key is every key column value (padded as configured) followed
    by ``SEPARATOR``, all prefixed by one ``SEPARATOR``:
    ``'..<k1>..<k2>..'``.
    """

    # Key value compared against to detect that a side has no more rows.
    END_of_KEY = 'ZZZ'
    SEPARATOR = '..'

    def __init__(self, matching_key_info_list):
        """Store the key definitions (objects exposing ``index`` and ``key_for(row)``)."""
        self.matching_key_info_list = matching_key_info_list

    def __repr__(self):
        return f'{self.__class__.__name__}({self.matching_key_info_list!r})'

    def managed_key_for(self, row):
        """Return the composite key string for ``row``.

        Logs an error and exits with status 1 when a key index is out of
        range for ``row``.
        """

        try:
            return self.SEPARATOR + ''.join(matching_key.key_for(row) + self.SEPARATOR
                                            for matching_key in self.matching_key_info_list)
        except IndexError:
            logger.error(f'one of the indices specified for MATCHING_KEY_INDICES is out of range [MATCHING_KEY_INDICES={self.matching_key_info_list}, number of columns = {len(row)}, row={row}]')
            # sys.exit, as elsewhere in this file, instead of the site-provided exit().
            sys.exit(1)

    @property
    def matching_key_indices(self):
        """Column indices of all matching keys, in definition order."""
        return [matching_key_info.index for matching_key_info in self.matching_key_info_list]

    @classmethod
    def decode_key(cls, key):
        """ Leave the padding as it is. """
        # Remove exactly one separator at each end. str.strip() treats its
        # argument as a set of characters and would also remove dots that
        # belong to the first or last key value.
        separator = cls.SEPARATOR
        if key.startswith(separator):
            key = key[len(separator):]
        if key.endswith(separator):
            key = key[:-len(separator)]
        return key.split(separator)
458# ----------------------------------------------------------------------------------------------------------------------
459# Control and Determine if it exists only on the left, only on the right, or both
460# ----------------------------------------------------------------------------------------------------------------------
def run_in(context):
    """Open both CSV files and run the comparison.

    Fixes the dialect for the left then the right file (each step may hand
    back an adjusted context), pre-scans both files through a shared reader,
    resets the reader and runs the difference detection.
    """
    lhs_path, lhs_encoding = context.lhs_file_path, context.encoding_for_lhs
    rhs_path, rhs_encoding = context.rhs_file_path, context.encoding_for_rhs

    with open(lhs_path, mode='r', encoding=lhs_encoding) as lhs_csv, \
            open(rhs_path, mode='r', encoding=rhs_encoding) as rhs_csv:

        lhs_dialect, context_after_lhs = CsvDialectFixer.fixed_dialect(context, lhs_csv, FileArrangement.LHS)
        rhs_dialect, final_context = CsvDialectFixer.fixed_dialect(context_after_lhs, rhs_csv, FileArrangement.RHS)

        reader = CsvReader(lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, final_context)
        scan_result = PreScanner.scan(final_context, reader)
        # Rewind after the pre-scan so matching starts from the first row.
        reader.reset()

        detect_diff(final_context, reader, scan_result)
def detect_diff(context, csv_reader, pre_scan_result):
    """Match rows of both files by key, report each case and the counts.

    Prints the headings first, then lets ``perform_key_matching`` drive
    three callbacks (left only / both sides / right only) that update the
    counter and the detail report, and finally reports the counts.
    """
    difference_detector = ValueDifferenceDetector(
        pre_scan_result.number_of_columns,
        context.matching_key_codec.matching_key_indices,
        context.column_indices_to_ignore,
    )

    headings = HeadingReporter(context)
    details = DetailReporter.Factory.reporter_for(context, pre_scan_result)
    counts = CountReporter(context.shows_count)
    counter = counts.counter

    headings.report_heading()
    details.report_detail_heading()

    def on_lhs_only(lhs_fact):
        counter.count_for_case_of_existed_only_on_lhs(lhs_fact.lhs_row_number)
        details.report_case_of_existed_only_on_lhs(lhs_fact)

    def on_both_sides(lhs_fact, rhs_fact):
        # Same key on both sides: compare the remaining columns.
        difference = difference_detector.detect_difference_between(lhs_fact.lhs_row, rhs_fact.rhs_row)
        counter.count_for_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)
        details.report_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)

    def on_rhs_only(rhs_fact):
        counter.count_for_case_of_existed_only_on_rhs(rhs_fact.rhs_row_number)
        details.report_case_of_existed_only_on_rhs(rhs_fact)

    perform_key_matching(csv_reader, on_lhs_only, on_both_sides, on_rhs_only)

    counts.report_count()
def perform_key_matching(csv_reader, callback_for_lhs_only, callback_for_both_sides, callback_for_rhs_only):
    """Walk both sides in key order and dispatch each row to a callback.

    The side with the smaller key is reported as existing only on that side
    and advanced; equal keys are reported together and both sides advance.
    The loop ends once both sides return the end-of-key marker.
    """
    end_of_key = MatchingKeyCodec.END_of_KEY

    left = csv_reader.read_lhs()
    right = csv_reader.read_rhs()

    while not (left.lhs_key == end_of_key and right.rhs_key == end_of_key):

        if left.lhs_key == right.rhs_key:
            callback_for_both_sides(left, right)
            left = csv_reader.read_lhs()
            right = csv_reader.read_rhs()

        elif left.lhs_key < right.rhs_key:
            callback_for_lhs_only(left)
            left = csv_reader.read_lhs()

        elif left.lhs_key > right.rhs_key:
            callback_for_rhs_only(right)
            right = csv_reader.read_rhs()
532# ----------------------------------------------------------------------------------------------------------------------
533# Value-Difference Detection
534# ----------------------------------------------------------------------------------------------------------------------
class ValueDifferenceDetector:
    """Finds which non-key, non-ignored columns differ between two rows."""

    class ValueDifferenceResult:
        """Column indices whose values differ for one pair of rows."""

        def __init__(self, different_column_indices):
            self.different_column_indices = different_column_indices

        @property
        def has_difference(self):
            """True when at least one column differs."""
            return bool(self.different_column_indices)

    def __init__(self, number_of_columns, matching_key_indices, ignore_column_indices):
        """Work out the column indices to compare.

        The targets are all indices ``0..number_of_columns-1`` minus the
        matching-key indices and minus the ignored indices.
        """
        self.column_indices = range(number_of_columns)
        logger.debug(f'column_indices={self.column_indices}')

        excluded = set(matching_key_indices) | set(ignore_column_indices)
        self.target_column_indices = set(self.column_indices) - excluded
        logger.debug(f'target_column_indices={self.target_column_indices}')

    def detect_difference_between(self, lhs_row, rhs_row):
        """Compare the target columns of two rows and return a ``ValueDifferenceResult``."""
        different_column_indices = []
        for column_index in self.target_column_indices:
            if lhs_row[column_index] != rhs_row[column_index]:
                different_column_indices.append(column_index)

        logger.debug(f'different_column_indices={different_column_indices}')
        return self.ValueDifferenceResult(different_column_indices)
565# ----------------------------------------------------------------------------------------------------------------------
566# Reporting
567# ----------------------------------------------------------------------------------------------------------------------
class PreScanner:
    """Reads ahead in both CSV files to collect facts needed before reporting."""

    class ScanResult:
        """Outcome of a pre-scan: column count and optional padding sizes."""

        def __init__(self, number_of_columns, size_info_for_padding):
            self.number_of_columns = number_of_columns
            self.size_info_for_padding = size_info_for_padding

        @classmethod
        def for_lightly(cls, number_of_columns):
            """Result of a light scan: no size information."""
            return PreScanner.ScanResult(number_of_columns, None)

        @classmethod
        def for_deeply(cls, number_of_columns, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
            """Result of a deep scan: column count plus size information."""
            padding_sizes = cls.SizeInfoForPadding(lhs_max_row_number, lhs_max_row_length,
                                                   rhs_max_row_number, rhs_max_row_length)
            return PreScanner.ScanResult(number_of_columns, padding_sizes)

        # NOTE(review): nested here because for_deeply reaches it through
        # ``cls``; the original indentation was not available -- confirm.
        class SizeInfoForPadding:
            """Largest row number and row length seen on each side."""

            def __init__(self, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
                self.lhs_max_row_number, self.lhs_max_row_length = lhs_max_row_number, lhs_max_row_length
                self.rhs_max_row_number, self.rhs_max_row_length = rhs_max_row_number, rhs_max_row_length

    def __init__(self):
        pass

    @classmethod
    def scan(cls, context, csv_reader):
        """Run the deep scan when padding sizes are needed, else the light one."""
        if not context.needs_size_info_for_padding:
            return PreScanner._scan_lightly(csv_reader)
        return PreScanner._scan_deeply(csv_reader)

    @classmethod
    def _scan_deeply(cls, csv_reader):
        """
        Notes
        -----
        Purpose of deep pre-scanning
            * Determine the number of columns for value difference detection
            * Get size information to format the horizontal report
        """
        started_at = time.perf_counter()

        lhs_fact = csv_reader.read_lhs()
        rhs_fact = csv_reader.read_rhs()

        number_of_columns = cls._determine_number_of_columns_from(lhs_fact, rhs_fact)

        # Drain the left side, tracking the widest rendered row.
        lhs_max_row_length = 0
        while lhs_fact.lhs_key != MatchingKeyCodec.END_of_KEY:
            row_width = UnicodeSupport.string_length_considering_east_asian_characters_of(str(lhs_fact.lhs_row))
            lhs_max_row_length = max(lhs_max_row_length, row_width)
            lhs_fact = csv_reader.read_lhs()

        # Then the right side.
        rhs_max_row_length = 0
        while rhs_fact.rhs_key != MatchingKeyCodec.END_of_KEY:
            row_width = UnicodeSupport.string_length_considering_east_asian_characters_of(str(rhs_fact.rhs_row))
            rhs_max_row_length = max(rhs_max_row_length, row_width)
            rhs_fact = csv_reader.read_rhs()

        lhs_max_row_number = csv_reader.lhs_csv_state.row_number
        rhs_max_row_number = csv_reader.rhs_csv_state.row_number
        logger.debug(f'lhs_max_row_number={lhs_max_row_number}')
        logger.debug(f'rhs_max_row_number={rhs_max_row_number}')

        elapsed_time_ = time.perf_counter() - started_at
        logger.debug(f'PreScanner#scan() elapsed_time:{elapsed_time_}[sec]')
        return PreScanner.ScanResult.for_deeply(number_of_columns,
                                                lhs_max_row_number, lhs_max_row_length,
                                                rhs_max_row_number, rhs_max_row_length)

    @classmethod
    def _scan_lightly(cls, csv_reader):
        """
        Notes
        -----
        Purpose of light pre-scanning
            * Determine the number of columns for value difference detection

        Vertical reports do not require size information for formatting.
        """
        first_lhs_fact = csv_reader.read_lhs()
        first_rhs_fact = csv_reader.read_rhs()

        number_of_columns = cls._determine_number_of_columns_from(first_lhs_fact, first_rhs_fact)
        return PreScanner.ScanResult.for_lightly(number_of_columns)

    @classmethod
    def _determine_number_of_columns_from(cls, lhs_fact, rhs_fact):
        """Column count of the left row if non-empty, else of the right row, else 0."""
        if lhs_fact.lhs_row:
            return len(lhs_fact.lhs_row)
        if rhs_fact.rhs_row:
            return len(rhs_fact.rhs_row)
        return 0
class Mark(type):
    """Single-character markers shown between the two sides of a report line."""

    # Row exists on one side only.
    LHS_ONLY = '<'
    RHS_ONLY = '>'

    # Row exists on both sides.
    HAS_DIFF = '!'
    NON_DIFF_EXPRESSLY = '='
    NON_DIFF = ' '
class HeadingReporter:
    """Prints the report title and, on request, the context settings."""

    def __init__(self, context):
        self.cxt = context

    def report_heading(self):
        """Print the title, then the context block if the context asks for it."""
        self._report_title()

        if not self.cxt.shows_context_from_arguments:
            return
        self._report_context()

    @classmethod
    @spacing_before(1)
    def _report_title(cls):
        print('============ Report ============')

    @spacing_before(1)
    def _report_context(self):
        """Print every context setting, one per line."""
        cxt = self.cxt
        report_style = "Vertical" if cxt.reports_in_vertical_style else "Two facing (Horizontal)"

        print('● Context')
        print(f'File Path on the Left-Hand Side: {cxt.lhs_file_path}')
        print(f'File Path on the Right-Hand Side : {cxt.rhs_file_path}')
        print(f'Matching Key Indices: {cxt.matching_key_codec.matching_key_info_list}')
        print(f'Matching Key Is Unique?: {cxt.key_should_be_unique}')
        print(f'Column Indices to Ignore: {cxt.column_indices_to_ignore}')
        print(f'with Header?: {cxt.first_row_is_header}')
        print(f'Report Style: {report_style}')
        print(f'Show Count?: {cxt.shows_count}')
        print(f'Show Difference Only?: {cxt.shows_difference_only}')
        print(f'Show All?: {cxt.shows_all_lines}')
        print(f'Show Context?: {cxt.shows_context_from_arguments}')
        print(f'File Encoding for Left-Hand Side: {cxt.encoding_for_lhs}')
        print(f'File Encoding for Right-Hand Side: {cxt.encoding_for_rhs}')
        print(f'CSV Sniffing Size: {cxt.sniffing_size}')

        print('--- csv analysis conditions ---')
        print(f'Forces Individual Specified Conditions?: {cxt.forces_individual_specs}')
        print(f'column_separator_for_lhs: {cxt.display_string_for_column_separator(cxt.column_separator_for_lhs)}')
        print(f'column_separator_for_rhs: {cxt.display_string_for_column_separator(cxt.column_separator_for_rhs)}')
        print(f'line_separator_for_lhs: {cxt.display_string_for_line_separator(cxt.line_separator_for_lhs, FileArrangement.LHS)}')
        print(f'line_separator_for_rhs: {cxt.display_string_for_line_separator(cxt.line_separator_for_rhs, FileArrangement.RHS)}')
        print(f'quote_char_for_lhs: {cxt.quote_char_for_lhs}')
        print(f'quote_char_for_rhs: {cxt.quote_char_for_rhs}')
        print(f'skips_space_after_column_separator_for_lhs: {cxt.skips_space_after_column_separator_for_lhs}')
        print(f'skips_space_after_column_separator_for_rhs: {cxt.skips_space_after_column_separator_for_rhs}')
class DetailReporter(metaclass=abc.ABCMeta):
    """Abstract base for the per-row detail report.

    Subclasses provide the file-name line and the three per-case report
    methods. ``Factory.reporter_for`` picks the concrete reporter from the
    context's report style.

    The metaclass is declared with the Python 3 ``metaclass=`` keyword; the
    former ``__metaclass__`` class attribute is ignored by Python 3, so the
    abstract methods were not enforced.
    NOTE(review): this assumes every concrete reporter overrides all
    abstract methods below -- confirm against the subclasses.
    """

    def __init__(self, context):
        self.cxt = context

    def report_detail_heading(self):
        """Print the detail heading and file names, unless details are off."""

        if not self.cxt.shows_details:
            return

        self._report_content_heading()
        self._report_file_name()

    @spacing_before(1)
    def _report_content_heading(self):
        """Print the section title matching the selected detail mode."""
        if self.cxt.shows_difference_only:
            print('● Differences')
        elif self.cxt.shows_all_lines:
            print('● All')

    @abc.abstractmethod
    def _report_file_name(self):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        raise NotImplementedError()

    class Factory:
        """Chooses the concrete reporter for a context."""

        def __init__(self):
            pass

        @staticmethod
        def reporter_for(context, scan_result):
            """Return a vertical or horizontal reporter per the context's style."""

            if context.reports_in_vertical_style:
                return VerticalReporter(context, scan_result)
            else:
                return HorizontalReporter(context, scan_result)
class HorizontalReporter(DetailReporter):
    """Detail reporter that prints the left-hand and right-hand rows side by side on one line."""

    class Template:
        """Fixed-width layout of one horizontal report line.

        A line consists of the left row number and row, a difference mark
        surrounded by filler spaces, the right row number and row and, for rows
        that differ, the indices of the differing columns.
        """

        DIFFERENT_COLUMN_GUIDE = 'Column indices with difference'
        PREFIX_of_DIFF_COLUMNS = ' @ '

        def __init__(self, lhs_max_row_number_length, lhs_max_row_length, rhs_max_row_number_length, rhs_max_row_length):
            """Derive the width of every area of the line from the given maximum lengths."""
            # Left-hand area: row number, filler, row text.
            self.lhs_max_row_number_length = lhs_max_row_number_length
            self.lhs_filler_length = 1
            self.lhs_max_row_length = lhs_max_row_length
            self.lhs_length = self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length

            # Difference-mark area: front filler, the one-character mark, rear filler.
            # diff_mark_length holds the width of the whole area, not of the mark alone.
            self.diff_mark_filler_length_in_front = 2
            self.diff_mark_filler_length_in_rear = 2
            self.diff_mark_length = self.diff_mark_filler_length_in_front + 1 + self.diff_mark_filler_length_in_rear

            # Right-hand area: row number, filler, row text.
            self.rhs_max_row_number_length = rhs_max_row_number_length
            self.rhs_filler_length = 1
            self.rhs_max_row_length = rhs_max_row_length
            self.rhs_length = self.rhs_max_row_number_length + self.rhs_filler_length + self.rhs_max_row_length

            self.prefix_length_for_diff_columns_displays = len(self.PREFIX_of_DIFF_COLUMNS)

        # --- heading-related description ---

        def division_string(self):
            """Return a dashed rule as wide as the heading line."""
            total_width = sum((
                self.lhs_length,
                self.diff_mark_length,
                self.rhs_length,
                self.prefix_length_for_diff_columns_displays,
                len(self.DIFFERENT_COLUMN_GUIDE),
            ))
            return '-' * total_width

        def file_name_description(self, lhs_file_name, rhs_file_name):
            """Return the heading line: both file names padded to their areas, then the column guide."""
            padded_lhs = UnicodeSupport.left_justified(lhs_file_name, self.lhs_length)
            mark_gap = ' ' * self.diff_mark_length
            padded_rhs = UnicodeSupport.left_justified(rhs_file_name, self.rhs_length)
            prefix_gap = ' ' * self.prefix_length_for_diff_columns_displays
            return f'{padded_lhs}{mark_gap}{padded_rhs}{prefix_gap}{self.DIFFERENT_COLUMN_GUIDE}'

        def _diff_mark_area(self, mark):
            """Return the given mark surrounded by its front and rear filler spaces."""
            return (' ' * self.diff_mark_filler_length_in_front) + mark + (' ' * self.diff_mark_filler_length_in_rear)

        # --- left-hand side related description ---

        def lhs_only_description(self, lhs_fact):
            """Return the line for a row that exists only on the left-hand side."""
            left_part = self._lhs_description(lhs_fact)
            mark_part = self._diff_mark_area(Mark.LHS_ONLY)
            return f'{left_part}{mark_part}'

        def _lhs_description(self, lhs_fact):
            """Return the left row number (right-aligned), a filler and the left row (left-aligned)."""
            number_text = UnicodeSupport.right_justified(str(lhs_fact.lhs_row_number), self.lhs_max_row_number_length)
            filler = ' ' * self.lhs_filler_length
            row_text = UnicodeSupport.left_justified(str(lhs_fact.lhs_row), self.lhs_max_row_length)
            return f'{number_text}{filler}{row_text}'

        def _lhs_empty_description(self):
            """Return blanks as wide as the left-hand area."""
            blank_width = self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length
            return ' ' * blank_width

        # --- right-hand side related description ---

        def rhs_only_description(self, rhs_fact):
            """Return the line for a row that exists only on the right-hand side."""
            blank_left = self._lhs_empty_description()
            mark_part = self._diff_mark_area(Mark.RHS_ONLY)
            right_part = self._rhs_description(rhs_fact)
            return f'{blank_left}{mark_part}{right_part}'

        def _rhs_description(self, rhs_fact):
            """Return the right row number (right-aligned), a filler and the right row (left-aligned)."""
            number_text = UnicodeSupport.right_justified(str(rhs_fact.rhs_row_number), self.rhs_max_row_number_length)
            filler = ' ' * self.rhs_filler_length
            row_text = UnicodeSupport.left_justified(str(rhs_fact.rhs_row), self.rhs_max_row_length)
            return f'{number_text}{filler}{row_text}'

        # --- both sides related description ---

        def both_description(self, lhs_fact, rhs_fact, value_difference_result):
            """Return the line for a key found on both sides.

            The differing column indices are appended only when the rows differ.
            """
            left_part = self._lhs_description(lhs_fact)
            has_diff = value_difference_result.has_difference
            mark_part = self._diff_mark_area(Mark.HAS_DIFF if has_diff else Mark.NON_DIFF)
            right_part = self._rhs_description(rhs_fact)

            if has_diff:
                prefix = self.PREFIX_of_DIFF_COLUMNS
                columns = str(value_difference_result.different_column_indices)
            else:
                prefix = ''
                columns = ''
            return f'{left_part}{mark_part}{right_part}{prefix}{columns}'

    def __init__(self, context, scan_result):
        """Build the line template from the scan result when padding sizes are needed.

        When ``context.needs_size_info_for_padding`` is falsy no template is created.
        """
        super().__init__(context)

        if not context.needs_size_info_for_padding:
            self.template = None
            return

        sizes = scan_result.size_info_for_padding
        self.template = HorizontalReporter.Template(
            len(str(sizes.lhs_max_row_number)),
            sizes.lhs_max_row_length,
            len(str(sizes.rhs_max_row_number)),
            sizes.rhs_max_row_length,
        )

    # --- report heading related ---

    def _report_file_name(self):
        """Print the file-name heading between two dashed rules."""
        rule = self.template.division_string
        lhs_name = os.path.basename(self.cxt.lhs_file_name)
        rhs_name = os.path.basename(self.cxt.rhs_file_name)

        print(rule())
        print(self.template.file_name_description(lhs_name, rhs_name))
        print(rule())

    # --- report each cases ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        """Print a left-only row when details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        """Print a both-sides row: differing rows in difference-only mode, every row in all-lines mode."""
        if not ((self.cxt.shows_difference_only and value_difference_result.has_difference) or self.cxt.shows_all_lines):
            return
        print(self.template.both_description(lhs_fact, rhs_fact, value_difference_result))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        """Print a right-only row when details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.rhs_only_description(rhs_fact))
class VerticalReporter(DetailReporter):
    """Detail reporter that prints the left-hand and right-hand rows on separate lines."""

    class Template:
        """Text fragments of the vertical report; every builder is a classmethod."""

        LHS_MARK = 'L'
        RHS_MARK = 'R'
        PREFIX_of_DIFF_COLUMNS = '@'

        # --- heading-related description ---

        @classmethod
        def division_string(cls):
            """Return a dashed rule of 80 characters."""
            return 80 * '-'

        @classmethod
        def file_name_description(cls, mark, file_name):
            """Return the side mark followed by the file name."""
            return f'{mark} {file_name}'

        # --- left-hand side related description ---

        @classmethod
        def lhs_only_description(cls, lhs_fact):
            """Return the line for a row that exists only on the left-hand side."""
            number_text = str(lhs_fact.lhs_row_number)
            row_text = str(lhs_fact.lhs_row)
            return f'{Mark.LHS_ONLY} {cls.LHS_MARK} {number_text} {row_text}'

        # --- right-hand side related description ---

        @classmethod
        def rhs_only_description(cls, rhs_fact):
            """Return the line for a row that exists only on the right-hand side."""
            number_text = str(rhs_fact.rhs_row_number)
            row_text = str(rhs_fact.rhs_row)
            return f'{Mark.RHS_ONLY} {cls.RHS_MARK} {number_text} {row_text}'

        # --- both sides related description ---

        @classmethod
        def both_description_heading(cls, value_difference_result):
            """Return the heading of a both-sides entry.

            For rows without difference this is the explicit "no difference" mark;
            otherwise the difference mark followed by the differing column indices.
            """
            if not value_difference_result.has_difference:
                return Mark.NON_DIFF_EXPRESSLY

            indices_text = str(value_difference_result.different_column_indices)
            return f'{Mark.HAS_DIFF} {cls.PREFIX_of_DIFF_COLUMNS} {indices_text}'

        @classmethod
        def both_description_lhs(cls, lhs_fact, row_number_length):
            """Return the left-hand line of a both-sides entry, row number right-aligned to the given width."""
            number_text = str(lhs_fact.lhs_row_number).rjust(row_number_length)
            row_text = str(lhs_fact.lhs_row)
            return f' {cls.LHS_MARK} {number_text} {row_text}'

        @classmethod
        def both_description_rhs(cls, rhs_fact, row_number_length):
            """Return the right-hand line of a both-sides entry, row number right-aligned to the given width."""
            number_text = str(rhs_fact.rhs_row_number).rjust(row_number_length)
            row_text = str(rhs_fact.rhs_row)
            return f' {cls.RHS_MARK} {number_text} {row_text}'

    def __init__(self, context, _):
        """Create the reporter; the second argument (the scan result) is not used by this style."""
        super().__init__(context)
        self.template = VerticalReporter.Template()

    # --- report heading related ---

    def _report_file_name(self):
        """Print one line per file, each tagged with its side mark, between two dashed rules."""
        tpl = self.template
        lhs_name = os.path.basename(self.cxt.lhs_file_name)
        rhs_name = os.path.basename(self.cxt.rhs_file_name)

        print(tpl.division_string())
        print(tpl.file_name_description(tpl.LHS_MARK, lhs_name))
        print(tpl.file_name_description(tpl.RHS_MARK, rhs_name))
        print(tpl.division_string())

    # --- report each cases ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        """Print a left-only row when details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        """Print a both-sides entry (heading, left line, right line).

        Differing rows are printed in difference-only mode, every row in all-lines mode.
        """
        if not ((self.cxt.shows_difference_only and value_difference_result.has_difference) or self.cxt.shows_all_lines):
            return

        # Both row numbers are padded to the wider of the two so the rows line up.
        number_width = max(len(str(lhs_fact.lhs_row_number)), len(str(rhs_fact.rhs_row_number)))

        print(self.template.both_description_heading(value_difference_result))
        print(self.template.both_description_lhs(lhs_fact, number_width))
        print(self.template.both_description_rhs(rhs_fact, number_width))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        """Print a right-only row when details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.rhs_only_description(rhs_fact))
class CountReporter:
    """Tallies the outcome of every compared row and prints the count summary."""

    class Counter:
        """Running totals and row numbers for each comparison outcome."""

        def __init__(self):
            # Number of rows per outcome.
            self.number_of_same_lines = 0
            self.number_of_lhs_only = 0
            self.number_of_rhs_only = 0
            self.number_of_differences = 0

            # Row numbers per outcome; differences map a left row number to its right row number.
            self.row_numbers_for_lhs_only = []
            self.row_numbers_for_rhs_only = []
            self.row_numbers_for_differences = {}

            # Cache for max_digit; filled on first access.
            self._max_digit = None

        def _increment_same_lines(self):
            self.number_of_same_lines += 1

        def _increment_lhs_only(self):
            self.number_of_lhs_only += 1

        def _increment_rhs_only(self):
            self.number_of_rhs_only += 1

        def _increment_differences(self):
            self.number_of_differences += 1

        def _add_row_number_for_lhs_only(self, row_number):
            self.row_numbers_for_lhs_only.append(row_number)

        def _add_row_number_for_rhs_only(self, row_number):
            self.row_numbers_for_rhs_only.append(row_number)

        def _add_row_number_for_differences(self, lhs_row_number, rhs_row_number):
            self.row_numbers_for_differences[lhs_row_number] = rhs_row_number

        def count_for_case_of_existed_only_on_lhs(self, row_number):
            """Record a row that exists only on the left-hand side."""
            self._increment_lhs_only()
            self._add_row_number_for_lhs_only(row_number)

        def count_for_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
            """Record a key found on both sides, either as an identical line or as a difference."""
            if not value_difference_result.has_difference:
                self._increment_same_lines()
                return

            self._increment_differences()
            self._add_row_number_for_differences(lhs_fact.lhs_row_number, rhs_fact.rhs_row_number)

        def count_for_case_of_existed_only_on_rhs(self, row_number):
            """Record a row that exists only on the right-hand side."""
            self._increment_rhs_only()
            self._add_row_number_for_rhs_only(row_number)

        @property
        def sorted_row_numbers_for_differences(self):
            """(left row number, right row number) pairs ordered by the left row number."""
            pairs = self.row_numbers_for_differences.items()
            return sorted(pairs, key=lambda pair: pair[0])

        @property
        def max_digit(self):
            """Number of digits of the widest of the four counts.

            The value is computed on first access and cached afterwards.
            """
            if self._max_digit is None:
                totals = (
                    self.number_of_same_lines,
                    self.number_of_lhs_only,
                    self.number_of_rhs_only,
                    self.number_of_differences,
                )
                self._max_digit = max(len(str(total)) for total in totals)
            return self._max_digit

    def __init__(self, shows_count):
        """Remember whether the summary is to be printed and start a fresh counter."""
        self.shows_count = shows_count
        self.counter = self.Counter()

    def _func_of_right_justified_number(self):
        """Return a function that right-justifies a number to the counter's max_digit width."""

        def right_justified(number):
            return str(number).rjust(self.counter.max_digit)

        return right_justified

    @spacing_before(1)
    def report_count(self):
        """Print the counts and row numbers per outcome; prints nothing more when shows_count is falsy."""
        if not self.shows_count:
            return

        print('● Count & Row number')

        rjust = self._func_of_right_justified_number()
        tally = self.counter
        print('same lines : {}'.format(rjust(tally.number_of_same_lines)))
        print('left side only ({}): {} :-- Row Numbers -->: {}'.format(
            Mark.LHS_ONLY, rjust(tally.number_of_lhs_only), tally.row_numbers_for_lhs_only))
        print('right side only ({}): {} :-- Row Numbers -->: {}'.format(
            Mark.RHS_ONLY, rjust(tally.number_of_rhs_only), tally.row_numbers_for_rhs_only))
        print('with differences ({}): {} :-- Row Number Pairs -->: {}'.format(
            Mark.HAS_DIFF, rjust(tally.number_of_differences), tally.sorted_row_numbers_for_differences))
class UnicodeSupport:
    """Padding helpers that measure text by display width instead of character count."""

    @classmethod
    def left_justified(cls, value, length):
        """Return value followed by the spaces needed to reach the given display width."""
        missing = length - cls.string_length_considering_east_asian_characters_of(value)
        padding = ' ' * missing
        return f'{value}{padding}'

    @classmethod
    def right_justified(cls, value, length):
        """Return value preceded by the spaces needed to reach the given display width."""
        missing = length - cls.string_length_considering_east_asian_characters_of(value)
        padding = ' ' * missing
        return f'{padding}{value}'

    @staticmethod
    def string_length_considering_east_asian_characters_of(text):
        """Return the display width of text.

        Characters whose East Asian width is F, W or A count as two columns,
        every other character as one.
        """
        return sum(2 if unicodedata.east_asian_width(char) in 'FWA' else 1 for char in text)
1129# ----------------------------------------------------------------------------------------------------------------------
1130# CSV Reading
1131# ----------------------------------------------------------------------------------------------------------------------
class FileArrangement(type):
    """Attribute-name suffixes that tell the left-hand file from the right-hand file.

    The suffix is appended to a base attribute name of the context
    (for example "column_separator") to address the per-file setting.
    """

    LHS = '_for_lhs'  # suffix of left-hand settings
    RHS = '_for_rhs'  # suffix of right-hand settings
class CsvDialectFixer:
    """Decides the csv dialect for one input file, from the context or by sniffing the file."""

    # Pairs of (csv dialect attribute, base name of the matching context attribute).
    # The context attribute is addressed as base name + file arrangement suffix.
    _DIALECT_TO_CONTEXT = (
        ('delimiter', "column_separator"),
        ('lineterminator', "line_separator"),
        ('quotechar', "quote_char"),
        ('skipinitialspace', "skips_space_after_column_separator"),
    )

    @classmethod
    def fixed_dialect(cls, context, csv_file, file_arrangement):
        """Return a (dialect, context) pair for the given file.

        The dialect is built from the context when ``context.forces_individual_specs``
        is truthy; otherwise it is sniffed from csv_file.
        """
        if not context.forces_individual_specs:
            return cls._try_sniffing(context, csv_file, file_arrangement)
        return cls._dialect_from_context(context, file_arrangement)

    @classmethod
    def _dialect_from_context(cls, context, file_arrangement):
        """Return an excel-based dialect overridden with the per-file settings of the context."""
        dialect = csv.excel()
        for dialect_attr, context_base in cls._DIALECT_TO_CONTEXT:
            setattr(dialect, dialect_attr, getattr(context, context_base + file_arrangement))

        return dialect, context

    @classmethod
    def _try_sniffing(cls, context, csv_file, file_arrangement):
        """Sniff the dialect, falling back to the context-based dialect on csv.Error.

        The file is rewound to its beginning in every case.
        """
        try:
            return cls._sniff(context, csv_file, file_arrangement)

        except csv.Error as e:

            logger.warning(f'Sniffing failed. Generated a dialect from context instead. [type={type(e)}, args={str(e.args)}, message={traceback.format_exception_only(type(e), e)}]')
            return cls._dialect_from_context(context, file_arrangement)

        finally:
            csv_file.seek(0)

    @classmethod
    def _sniff(cls, context, csv_file, file_arrangement):
        """Sniff dialect and header presence from the first ``context.sniffing_size`` characters."""
        sample = csv_file.read(context.sniffing_size)
        sniffer = csv.Sniffer()
        sniffed_dialect = sniffer.sniff(sample)
        header_detected = sniffer.has_header(sample)

        return sniffed_dialect, cls._adjust_context_with(sniffed_dialect, header_detected, context, file_arrangement)

    @classmethod
    def _adjust_context_with(cls, dialect, has_header, context, file_arrangement):
        """Copy the sniffed dialect settings into the context and settle first_row_is_header.

        An explicit ``context.header`` wins over the sniffed value: 'y' means a header
        row, any other non-None value means none.
        """
        for dialect_attr, context_base in cls._DIALECT_TO_CONTEXT:
            setattr(context, context_base + file_arrangement, getattr(dialect, dialect_attr))

        if context.header is None:
            context.first_row_is_header = has_header
        elif context.header == 'y':
            context.first_row_is_header = True
        else:
            context.first_row_is_header = False

        return context
def show_dialect_for_debugging(dialect, context, message, file_arrangement):
    """Write the settings of a csv dialect to the debug log, one setting per record.

    message labels the dump; context supplies the display strings for the
    column separator and the line separator.
    """

    def _lines():
        # Lazily yields each log line so that a line is built right before it is logged.
        yield f'---{message}---'
        yield f'sniffing dialect={dialect}'
        yield f'sniffing dialect csv.excel={isinstance(dialect, csv.excel)}'
        yield f'sniffing dialect csv.excel_tab={isinstance(dialect, csv.excel_tab)}'
        yield f'sniffing dialect csv.unix_dialect={isinstance(dialect, csv.unix_dialect)}'
        yield f'sniffing dialect.delimiter={context.display_string_for_column_separator(dialect.delimiter)}'
        yield f'sniffing dialect.doublequote={dialect.doublequote}'
        yield f'sniffing dialect.escapechar={dialect.escapechar}'
        yield f'sniffing dialect.lineterminator={context.display_string_for_line_separator(dialect.lineterminator, file_arrangement)}'
        yield f'sniffing dialect.quotechar={dialect.quotechar}'
        yield f'sniffing dialect.quoting={dialect.quoting}'
        yield f'sniffing dialect.skipinitialspace={dialect.skipinitialspace}'

    for line in _lines():
        logger.debug(line)
class LhsFact:
    """One row read from the left-hand CSV, with its row number and matching key."""

    def __init__(self, lhs_row_number, lhs_row, lhs_key):
        """Store the three values; the creation is traced in the debug log."""
        # The Japanese word in the log message means "created".
        logger.debug(f'LhsFact 生成 lhs_row_number={lhs_row_number}, lhs_row={lhs_row}, lhs_key={lhs_key}')

        self.lhs_row_number, self.lhs_row, self.lhs_key = lhs_row_number, lhs_row, lhs_key
class RhsFact:
    """One row read from the right-hand CSV, with its row number and matching key."""

    def __init__(self, rhs_row_number, rhs_row, rhs_key):
        """Store the three values; the creation is traced in the debug log."""
        # The Japanese word in the log message means "created".
        logger.debug(f'RhsFact 生成 rhs_row_number={rhs_row_number}, rhs_row={rhs_row}, rhs_key={rhs_key}')

        self.rhs_row_number, self.rhs_row, self.rhs_key = rhs_row_number, rhs_row, rhs_key
class CsvReader:
    """Reads the left-hand and right-hand CSV files row by row and checks their matching keys.

    Every row read is checked against the previous key of the same file: keys
    must not decrease and, when the context asks for unique keys, must not repeat.
    A violation is logged and terminates the program with exit status 1.
    """

    class State:
        """Reading state of one CSV file: its csv reader, current row number and last key seen."""

        def __init__(self, csv_file, dialect, file_name, first_row_is_header):
            self._csv_file = csv_file
            self._dialect = dialect
            self._file_name = file_name
            self._first_row_is_header = first_row_is_header

            self._csv_reader = csv.reader(csv_file, dialect)
            self._row_number = 0
            self._previous_key = ""

        def reset(self):
            """Rewind the file and start over with a fresh csv reader, row number 0 and no previous key."""
            self._csv_file.seek(0)
            self._csv_reader = csv.reader(self._csv_file, self._dialect)
            self._row_number = 0
            self._previous_key = ""

        def increment_row_number(self):
            """Advance the row number, unless the end-of-key marker has already been recorded."""
            if self._previous_key == MatchingKeyCodec.END_of_KEY:
                return

            self._row_number += 1

        def key_changed(self, new_key):
            """Remember new_key as the previous key; a key read from the header row is not remembered."""
            if self._is_header():
                return

            self._previous_key = new_key

        def _is_header(self):
            """True while no row has been counted yet and the first row is declared to be a header."""
            return self.row_number == 0 and self._first_row_is_header

        @property
        def csv_reader(self):
            return self._csv_reader

        @property
        def file_name(self):
            return self._file_name

        @property
        def row_number(self):
            return self._row_number

        @property
        def previous_key(self):
            return self._previous_key

    def __init__(self, lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, context):
        """Set up the reading state of both files and skip their header rows if the context says so."""
        # The labels passed to the debug dump mean "left CSV" and "right CSV".
        show_dialect_for_debugging(lhs_dialect, context, '左CSV', FileArrangement.LHS)
        show_dialect_for_debugging(rhs_dialect, context, '右CSV', FileArrangement.RHS)

        self.lhs_csv_state = CsvReader.State(lhs_csv, lhs_dialect, context.lhs_file_name, context.first_row_is_header)
        self.rhs_csv_state = CsvReader.State(rhs_csv, rhs_dialect, context.rhs_file_name, context.first_row_is_header)
        self.cxt = context

        self.skip_header()

    def skip_header(self):
        """Read and discard one row from each file when the first row is a header."""
        if self.cxt.first_row_is_header:
            _ = self.read_lhs()
            _ = self.read_rhs()

    def reset(self):
        """Rewind both files and skip their header rows again."""
        self.lhs_csv_state.reset()
        self.rhs_csv_state.reset()
        self.skip_header()

    def read_lhs(self):
        """Read the next left-hand row and return it as an LhsFact.

        At the end of the file the fact carries an empty row and the end-of-key marker.
        """
        lhs_row, lhs_key = self._read_csv(self.lhs_csv_state)
        self.lhs_csv_state.increment_row_number()
        return LhsFact(self.lhs_csv_state.row_number, lhs_row, lhs_key)

    def read_rhs(self):
        """Read the next right-hand row and return it as an RhsFact.

        At the end of the file the fact carries an empty row and the end-of-key marker.
        """
        rhs_row, rhs_key = self._read_csv(self.rhs_csv_state)
        self.rhs_csv_state.increment_row_number()
        return RhsFact(self.rhs_csv_state.row_number, rhs_row, rhs_key)

    def _read_csv(self, csv_state):
        """Return (row, key) for the next row of the given file.

        Returns ([], MatchingKeyCodec.END_of_KEY) when the file is exhausted.
        Exits the program when the key order or uniqueness is violated.
        """
        try:
            row = next(csv_state.csv_reader)
        except StopIteration:
            csv_state.key_changed(MatchingKeyCodec.END_of_KEY)
            return [], MatchingKeyCodec.END_of_KEY

        new_key = self.cxt.matching_key_codec.managed_key_for(row)
        self._detect_key_violation(new_key, csv_state)

        csv_state.key_changed(new_key)

        return row, new_key

    def _detect_key_violation(self, new_key, csv_state):
        """Log an error and exit with status 1 when new_key breaks the key rules of its file.

        Nothing is checked while the file has no previous key. A key smaller than
        the previous one means the file is not sorted; an equal key is an error
        only when ``context.key_should_be_unique`` is truthy.

        Raises:
            SystemExit: on either violation. ``sys.exit`` is used instead of the
                ``exit`` helper, which is installed by the ``site`` module for the
                interactive shell and is not guaranteed to exist in every run mode.
        """
        if csv_state.previous_key == '':
            return

        if new_key < csv_state.previous_key:
            logger.error(f'matching keys in {csv_state.file_name} are not sorted.'
                         f' [current_key={MatchingKeyCodec.decode_key(new_key)}, previous_key={MatchingKeyCodec.decode_key(csv_state.previous_key)}, matching-key-indices={self.cxt.matching_key_codec.matching_key_info_list}]'
                         f' If the key is a number without zero padding, specify the max size of the key after colon like -k0:8.')
            sys.exit(1)

        if self.cxt.key_should_be_unique and new_key == csv_state.previous_key:
            logger.error(f'matching keys in {csv_state.file_name} are not unique.'
                         f' [current_key={MatchingKeyCodec.decode_key(new_key)}, previous_key={MatchingKeyCodec.decode_key(csv_state.previous_key)}, matching-key-indices={self.cxt.matching_key_codec.matching_key_info_list}]')
            sys.exit(1)
# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()