#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import abc
import binascii
import csv
import functools
import logging
import os
import sys
import time
import traceback
import unicodedata
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from logging import Logger


# ----------------------------------------------------------------------------------------------------------------------
# Decorators
# ----------------------------------------------------------------------------------------------------------------------

def show_execution_time():

    def _execution_time(func):

        def wrapper(*args, **kwargs):

            start = time.perf_counter()

            result = func(*args, **kwargs)

            elapsed_time = time.perf_counter() - start
            print()
            print(f'elapsed_time={elapsed_time}[sec]')
            print()

            return result

        return wrapper

    return _execution_time


def spacing_before(number_of_lines):

    number_of_lines = number_of_lines or 1

    def _spacing_before(func):

        def wrapper(*args, **kwargs):

            for _ in range(number_of_lines):
                print('')

            return func(*args, **kwargs)

        return wrapper

    return _spacing_before

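# Illustrative usage of the decorators above (a sketch, not part of the tool; the function
# name below is hypothetical). Both factories take arguments, so they are applied with
# parentheses:
#
#     @show_execution_time()
#     @spacing_before(2)
#     def some_task():
#         print('working...')
#
#     some_task()   # prints two blank lines, runs the task, then prints the elapsed time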

# ----------------------------------------------------------------------------------------------------------------------
# Entrance
# ----------------------------------------------------------------------------------------------------------------------

# @show_execution_time()
def main():

    configure()

    context = context_from_arguments()
    show_context_for_debugging(context)

    try:
        run_in(context)
    except IndexError as e:
        logger.error(f'The number of columns may not be consistent across rows. Please check the CSV data.'
                     f' If the data is fine, please file an issue. [{type(e)}, description={e}]')
        sys.exit(1)

class App(type):

    NAME = 'csv-diff-python3@blue-monk'
    VERSION = '1.0.0'


class LoggingConfig(type):

    # For debugging, adjust CONSOLE_LEVEL or FILE_LEVEL.

    BASE_LEVEL = logging.DEBUG

    CONSOLE_LEVEL = logging.ERROR
    CONSOLE_FORMAT = '%(levelname)s: %(message)s'

    FILE_LEVEL = logging.WARNING
    FILE_FORMAT = '%(asctime)s: %(levelname)s: %(message)s'
    FILE_PATH = 'csvdiff.log'

logger: Logger = logging.getLogger(__name__)


def configure():

    logging.basicConfig(level=LoggingConfig.BASE_LEVEL)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(LoggingConfig.CONSOLE_LEVEL)
    stream_handler.setFormatter(logging.Formatter(LoggingConfig.CONSOLE_FORMAT))

    file_handler = logging.FileHandler(filename=LoggingConfig.FILE_PATH, mode='w')
    file_handler.setLevel(LoggingConfig.FILE_LEVEL)
    file_handler.setFormatter(logging.Formatter(LoggingConfig.FILE_FORMAT))

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.propagate = False

# ----------------------------------------------------------------------------------------------------------------------
# Context Preparation
# ----------------------------------------------------------------------------------------------------------------------

def context_from_arguments():

    def arg_type_matching_key_in_csv(x):
        return list(map(MatchingKeyInfo, x.split(',')))

    def arg_type_int_in_csv(x):
        return list(map(int, x.split(',')))


    parser = ArgumentParser(prog=App.NAME, formatter_class=ArgumentDefaultsHelpFormatter)

    # Program name & Version -------------------------------------------------------------------------------------------
    parser.add_argument('--version', action='version', version=f'%(prog)s {App.VERSION}')

    # Input CSV file paths ---------------------------------------------------------------------------------------------
    parser.add_argument('lhs_file_name', type=str, help='Absolute/Relative path to the left-hand side file.')
    parser.add_argument('rhs_file_name', type=str, help='Absolute/Relative path to the right-hand side file.')

    # Input CSV file encodings -----------------------------------------------------------------------------------------
    parser.add_argument('-e', '--encoding', type=str, default=None,
                        help='Encoding of both CSV files (see the Python reference "Standard Encodings"). e.g.: shift_jis')

    parser.add_argument('--encoding-for-lhs', type=str, default='utf8',
                        help='Encoding of the left-hand side CSV file (see the Python reference "Standard Encodings"). e.g.: shift_jis')
    parser.add_argument('--encoding-for-rhs', type=str, default='utf8',
                        help='Encoding of the right-hand side CSV file (see the Python reference "Standard Encodings"). e.g.: shift_jis')

    # Matching conditions ----------------------------------------------------------------------------------------------
    parser.add_argument('-k', '--matching-keys', type=arg_type_matching_key_in_csv, default='0',
                        help='Matching key indices (0-based) for the input CSV files, in CSV format. For numbers that are not zero-padded, specify the number of digits after ":". e.g.: 0:8,3')
    parser.add_argument('-u', '--unique-key', default=False, action='store_true',
                        help='Specify this if the matching key is unique. An error is then raised when a non-unique matching key is detected.')
    parser.add_argument('-i', '--ignore-columns', type=arg_type_int_in_csv, default=[],
                        help='Indices of the columns to ignore, in CSV format. e.g.: 3,7')

    # Report styles ----------------------------------------------------------------------------------------------------
    parser.add_argument('-v', '--vertical-style', default=False, action='store_true',
                        help='Report in vertical style. If not specified, report in horizontal (two facing) style.')

    parser.add_argument('-c', '--show-count', default=False, action='store_true',
                        help='Report the number of differences. Treated as True if neither -d nor -a is specified.')

    display_group = parser.add_mutually_exclusive_group()
    display_group.add_argument('-d', '--show-difference-only', default=False, action='store_true',
                               help='Report only the lines with differences. Can be used with option -c. Cannot be used with option -a.')
    display_group.add_argument('-a', '--show-all-lines', action='store_true',
                               help='Report all lines. Can be used with option -c. Cannot be used with option -d.')

    parser.add_argument('-x', '--show-context-from-arguments', default=False, action='store_true',
                        help='Report the context generated from the arguments and CSV sniffing.')

    # CSV analysis conditions ------------------------------------------------------------------------------------------
    parser.add_argument('-H', '--header', type=str, default=None, choices=['n', 'y'],
                        help='Whether the first row is a header. If specified, this overrides the CSV sniffing result.')

    parser.add_argument('-S', '--sniffing-size', type=int, default=4096,
                        help="If CSV sniffing fails, try specifying a size larger than 4096, or explicitly specify the CSV file conditions like '--column-separator-for-lhs TAB'. Check the help with the -h option.")

    parser.add_argument('-F', '--force-individual-specs', action='store_true',
                        help="If you don't want to rely on CSV sniffing, specify this, and then specify --column-separator and so on individually.")

    parser.add_argument('--column-separator', type=str, default=None, choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process both CSV files using the specified column delimiter.')

    parser.add_argument('--line-separator', type=str, default=None, choices=['LF', 'CRLF'],
                        help='Process both CSV files using the specified line separator.')

    parser.add_argument('--quote-char', type=str, default=None, choices=['"', "'"],
                        help='Process both CSV files using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator', action='store_true',
                        help='Specify this to treat the space immediately after the separator as data, for both CSV files.')

    # CSV analysis conditions by left and right ------------------------------------------------------------------------
    parser.add_argument('--column-separator-for-lhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process the left-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--column-separator-for-rhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process the right-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator-for-lhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process the left-hand side CSV file using the specified line separator.')

    parser.add_argument('--line-separator-for-rhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process the right-hand side CSV file using the specified line separator.')

    parser.add_argument('--quote-char-for-lhs', type=str, default='"', choices=['"', "'"],
                        help='Process the left-hand side CSV file using the specified quote character.')

    parser.add_argument('--quote-char-for-rhs', type=str, default='"', choices=['"', "'"],
                        help='Process the right-hand side CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator-for-lhs', default=False, action='store_true',
                        help='Specify this to treat the space immediately after the separator as data, for the left-hand side CSV file.')

    parser.add_argument('--no-skip-space-after-column-separator-for-rhs', default=False, action='store_true',
                        help='Specify this to treat the space immediately after the separator as data, for the right-hand side CSV file.')

    # ------------------------------------------------------------------------------------------------------------------

    return Context(parser.parse_args())

class Context:

    LINE_SEPARATOR_s = {
        "CR": '\r',
        "LF": '\n',
        "CRLF": '\r\n',
        None: '<None>',
    }

    COLUMN_SEPARATOR_s = {
        "COMMA": ',',
        "TAB": '\t',
        "SEMICOLON": ';',
        None: '<None>',
    }

    def __init__(self, args):

        # Input CSV file paths -----------------------------------------------------------------------------------------
        self.lhs_file_name = args.lhs_file_name
        self.rhs_file_name = args.rhs_file_name
        self.lhs_file_path = os.path.abspath(args.lhs_file_name)
        self.rhs_file_path = os.path.abspath(args.rhs_file_name)

        # Input CSV file encodings -------------------------------------------------------------------------------------
        if args.encoding:
            self.encoding_for_lhs = args.encoding
            self.encoding_for_rhs = args.encoding
        else:
            self.encoding_for_lhs = args.encoding_for_lhs
            self.encoding_for_rhs = args.encoding_for_rhs

        # Matching conditions ------------------------------------------------------------------------------------------
        self.matching_key_codec = MatchingKeyCodec(args.matching_keys)
        self.key_should_be_unique = args.unique_key
        self.column_indices_to_ignore = args.ignore_columns

        # Report styles ------------------------------------------------------------------------------------------------
        self.reports_in_vertical_style = args.vertical_style
        self.reports_in_horizontal_style = not args.vertical_style

        self.shows_count = args.show_count
        self.shows_difference_only = args.show_difference_only
        self.shows_all_lines = args.show_all_lines
        self.shows_details = self.shows_difference_only or self.shows_all_lines
        self.shows_context_from_arguments = args.show_context_from_arguments

        self.needs_size_info_for_padding = self.shows_details and self.reports_in_horizontal_style

        # CSV analysis conditions --------------------------------------------------------------------------------------
        self.header = args.header
        self.first_row_is_header = None

        self.sniffing_size = args.sniffing_size

        self.forces_individual_specs = args.force_individual_specs

        if self.forces_individual_specs and args.column_separator:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator]
        else:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_lhs]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_rhs]

        if self.forces_individual_specs and args.line_separator:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator]
        else:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator_for_lhs]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator_for_rhs]

        if self.forces_individual_specs and args.quote_char:
            self.quote_char_for_lhs = args.quote_char
            self.quote_char_for_rhs = args.quote_char
        else:
            self.quote_char_for_lhs = args.quote_char_for_lhs
            self.quote_char_for_rhs = args.quote_char_for_rhs

        if self.forces_individual_specs and args.no_skip_space_after_column_separator:
            self.skips_space_after_column_separator_for_lhs = not args.no_skip_space_after_column_separator
            self.skips_space_after_column_separator_for_rhs = not args.no_skip_space_after_column_separator
        else:
            # Fall back to the per-side flags, mirroring the other CSV analysis conditions above.
            self.skips_space_after_column_separator_for_lhs = not args.no_skip_space_after_column_separator_for_lhs
            self.skips_space_after_column_separator_for_rhs = not args.no_skip_space_after_column_separator_for_rhs


        self._validate()
        self._normalize()

    def _validate(self):

        if not os.path.exists(self.lhs_file_path):
            logger.error(f'lhs_file_path does not exist. [lhs_file_path={self.lhs_file_path}]')
            sys.exit(1)
        if not os.path.exists(self.rhs_file_path):
            logger.error(f'rhs_file_path does not exist. [rhs_file_path={self.rhs_file_path}]')
            sys.exit(1)

        if not os.path.isfile(self.lhs_file_path):
            logger.error(f'lhs_file_path is not a file. [lhs_file_path={self.lhs_file_path}]')
            sys.exit(1)
        if not os.path.isfile(self.rhs_file_path):
            logger.error(f'rhs_file_path is not a file. [rhs_file_path={self.rhs_file_path}]')
            sys.exit(1)

    def _normalize(self):

        if not any([self.shows_count, self.shows_difference_only, self.shows_all_lines]):
            self.shows_count = True

    def display_string_for_column_separator(self, value):

        candidates = [k for k, v in self.COLUMN_SEPARATOR_s.items() if v == value]
        if candidates:
            return candidates[0]
        else:
            return f'undefined({value})'

    def display_string_for_line_separator(self, value, file_arrangement):

        encoding_value = getattr(self, "encoding" + file_arrangement)
        return binascii.hexlify(value.encode(encoding_value)).decode()

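# Illustrative note on display_string_for_line_separator (a sketch, not executed): the line
# separator is reported as the hex of its encoded bytes, so with a utf8 file '\n' is shown
# as '0a' and '\r\n' as '0d0a'.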

def show_context_for_debugging(cxt):

    logger.debug(f'lhs_file_name={cxt.lhs_file_name}')
    logger.debug(f'rhs_file_name={cxt.rhs_file_name}')
    logger.debug(f'lhs_file_path={cxt.lhs_file_path}')
    logger.debug(f'rhs_file_path={cxt.rhs_file_path}')

    logger.debug(f'encoding_for_lhs={cxt.encoding_for_lhs}')
    logger.debug(f'encoding_for_rhs={cxt.encoding_for_rhs}')

    logger.debug(f'matching_key_codec={cxt.matching_key_codec}')
    logger.debug(f'key_should_be_unique={cxt.key_should_be_unique}')
    logger.debug(f'column_indices_to_ignore={cxt.column_indices_to_ignore}')

    logger.debug(f'reports_in_vertical_style={cxt.reports_in_vertical_style}')
    logger.debug(f'reports_in_horizontal_style={cxt.reports_in_horizontal_style}')
    logger.debug(f'shows_count={cxt.shows_count}')
    logger.debug(f'shows_difference_only={cxt.shows_difference_only}')
    logger.debug(f'shows_all_lines={cxt.shows_all_lines}')
    logger.debug(f'shows_context_from_arguments={cxt.shows_context_from_arguments}')
    logger.debug(f'needs_size_info_for_padding={cxt.needs_size_info_for_padding}')

    logger.debug(f'first_row_is_header={cxt.first_row_is_header}')
    logger.debug(f'sniffing_size={cxt.sniffing_size}')
    logger.debug(f'force_individual_specs={cxt.forces_individual_specs}')

    logger.debug(f'column_separator_for_lhs={cxt.display_string_for_column_separator(cxt.column_separator_for_lhs)}')
    logger.debug(f'column_separator_for_rhs={cxt.display_string_for_column_separator(cxt.column_separator_for_rhs)}')
    logger.debug(f'line_separator_for_lhs={cxt.display_string_for_line_separator(cxt.line_separator_for_lhs, FileArrangement.LHS)}')
    logger.debug(f'line_separator_for_rhs={cxt.display_string_for_line_separator(cxt.line_separator_for_rhs, FileArrangement.RHS)}')
    logger.debug(f'quote_char_for_lhs={cxt.quote_char_for_lhs}')
    logger.debug(f'quote_char_for_rhs={cxt.quote_char_for_rhs}')
    logger.debug(f'skips_space_after_column_separator_for_lhs={cxt.skips_space_after_column_separator_for_lhs}')
    logger.debug(f'skips_space_after_column_separator_for_rhs={cxt.skips_space_after_column_separator_for_rhs}')

    logger.debug(f'MatchingKeyCodec#END_of_KEY={MatchingKeyCodec.END_of_KEY}')


# ----------------------------------------------------------------------------------------------------------------------
# Matching Key Treatment
# ----------------------------------------------------------------------------------------------------------------------

class MatchingKeyInfo:

    def __init__(self, specified_string):

        elements = list(filter(lambda x: x != '', specified_string.split(':')))

        index = elements.pop(0)
        self.index = self._transform_into_numeric(index, 'index')

        max_length = elements.pop(0) if elements else '0'
        self.max_length = self._transform_into_numeric(max_length, 'max_length')

    def __repr__(self):
        return f"{self.__class__.__name__}({self.index!r}, {(self.max_length if self.max_length > 0 else '<not specified>')!r})"

    @classmethod
    def _transform_into_numeric(cls, value, name):

        if not value.isdigit():
            logger.error(f'MATCHING_KEY_INDICES should be a number. See also the help. [specified {name}={value}]')
            sys.exit(1)

        return int(value)

    def key_for(self, row):
        return row[self.index].rjust(self.max_length, '0')


class MatchingKeyCodec:

    END_of_KEY = 'ZZZ'
    SEPARATOR = '..'

    def __init__(self, matching_key_info_list):
        self.matching_key_info_list = matching_key_info_list

    def __repr__(self):
        return f'{self.__class__.__name__}({self.matching_key_info_list!r})'

    def managed_key_for(self, row):

        try:
            return functools.reduce(lambda making, matching_key: making + matching_key.key_for(row) + self.SEPARATOR,
                                    self.matching_key_info_list, self.SEPARATOR)
        except IndexError:
            logger.error(f'One of the indices specified for MATCHING_KEY_INDICES is out of range.'
                         f' [MATCHING_KEY_INDICES={self.matching_key_info_list}, number of columns={len(row)}, row={row}]')
            sys.exit(1)

    @property
    def matching_key_indices(self):
        return list(map(lambda matching_key_info: matching_key_info.index, self.matching_key_info_list))

    @classmethod
    def decode_key(cls, key):
        """ Leave the padding as it is. """
        return key.strip(cls.SEPARATOR).split(cls.SEPARATOR)

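# Illustrative sketch of the key encoding (not executed; the row below is hypothetical).
# With '-k 0:8,3', column 0 is zero-padded to 8 digits and column 3 is used as-is:
#
#     codec = MatchingKeyCodec([MatchingKeyInfo('0:8'), MatchingKeyInfo('3')])
#     codec.managed_key_for(['42', 'x', 'y', 'TOKYO'])      # -> '..00000042..TOKYO..'
#     MatchingKeyCodec.decode_key('..00000042..TOKYO..')    # -> ['00000042', 'TOKYO']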

# ----------------------------------------------------------------------------------------------------------------------
# Control: determine whether each row exists only on the left, only on the right, or on both sides
# ----------------------------------------------------------------------------------------------------------------------

def run_in(context):

    with open(context.lhs_file_path, mode='r', encoding=context.encoding_for_lhs) as lhs_csv, \
            open(context.rhs_file_path, mode='r', encoding=context.encoding_for_rhs) as rhs_csv:

        lhs_dialect, adjusted_context = CsvDialectFixer.fixed_dialect(context, lhs_csv, FileArrangement.LHS)
        rhs_dialect, adjusted_context = CsvDialectFixer.fixed_dialect(adjusted_context, rhs_csv, FileArrangement.RHS)

        csv_reader = CsvReader(lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, adjusted_context)
        pre_scan_result = PreScanner.scan(adjusted_context, csv_reader)
        csv_reader.reset()

        detect_diff(adjusted_context, csv_reader, pre_scan_result)


def detect_diff(context, csv_reader, pre_scan_result):

    value_difference_detector = ValueDifferenceDetector(pre_scan_result.number_of_columns,
                                                        context.matching_key_codec.matching_key_indices,
                                                        context.column_indices_to_ignore)

    heading_reporter = HeadingReporter(context)
    detail_reporter = DetailReporter.Factory.reporter_for(context, pre_scan_result)
    count_reporter = CountReporter(context.shows_count)
    counter = count_reporter.counter

    heading_reporter.report_heading()
    detail_reporter.report_detail_heading()


    def existed_only_on_lhs(lhs_fact):
        counter.count_for_case_of_existed_only_on_lhs(lhs_fact.lhs_row_number)
        detail_reporter.report_case_of_existed_only_on_lhs(lhs_fact)

    def existed_on_both_sides(lhs_fact, rhs_fact):
        value_difference_result = value_difference_detector.detect_difference_between(lhs_fact.lhs_row, rhs_fact.rhs_row)
        counter.count_for_case_of_existed_on_both_sides(lhs_fact, rhs_fact, value_difference_result)
        detail_reporter.report_case_of_existed_on_both_sides(lhs_fact, rhs_fact, value_difference_result)

    def existed_only_on_rhs(rhs_fact):
        counter.count_for_case_of_existed_only_on_rhs(rhs_fact.rhs_row_number)
        detail_reporter.report_case_of_existed_only_on_rhs(rhs_fact)

    perform_key_matching(csv_reader, existed_only_on_lhs, existed_on_both_sides, existed_only_on_rhs)


    count_reporter.report_count()


def perform_key_matching(csv_reader, callback_for_lhs_only, callback_for_both_sides, callback_for_rhs_only):

    lhs_fact = csv_reader.read_lhs()
    rhs_fact = csv_reader.read_rhs()

    while lhs_fact.lhs_key != MatchingKeyCodec.END_of_KEY or rhs_fact.rhs_key != MatchingKeyCodec.END_of_KEY:

        if lhs_fact.lhs_key < rhs_fact.rhs_key:
            callback_for_lhs_only(lhs_fact)
            lhs_fact = csv_reader.read_lhs()

        elif lhs_fact.lhs_key == rhs_fact.rhs_key:
            callback_for_both_sides(lhs_fact, rhs_fact)
            lhs_fact = csv_reader.read_lhs()
            rhs_fact = csv_reader.read_rhs()

        elif lhs_fact.lhs_key > rhs_fact.rhs_key:
            callback_for_rhs_only(rhs_fact)
            rhs_fact = csv_reader.read_rhs()

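# Illustrative walk-through of the merge above (a sketch, not executed). Both files must be
# sorted by the matching key; the smaller key advances on its own side, equal keys advance both:
#
#     lhs keys: ..001.. ..003..          rhs keys: ..001.. ..002..
#     step 1: 001 == 001 -> existed_on_both_sides, read both
#     step 2: 003 >  002 -> existed_only_on_rhs,  read rhs
#     step 3: 003 <  ZZZ -> existed_only_on_lhs,  read lhs (rhs is exhausted, its key is END_of_KEY)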

# ----------------------------------------------------------------------------------------------------------------------
# Value-Difference Detection
# ----------------------------------------------------------------------------------------------------------------------

class ValueDifferenceDetector:

    class ValueDifferenceResult:

        def __init__(self, different_column_indices):

            self.different_column_indices = different_column_indices

        @property
        def has_difference(self):
            return bool(self.different_column_indices)


    def __init__(self, number_of_columns, matching_key_indices, ignore_column_indices):

        self.column_indices = range(0, number_of_columns)
        logger.debug(f'column_indices={self.column_indices}')

        self.target_column_indices = set(self.column_indices) - set(matching_key_indices) - set(ignore_column_indices)
        logger.debug(f'target_column_indices={self.target_column_indices}')

    def detect_difference_between(self, lhs_row, rhs_row):

        different_column_indices = [index for index in self.target_column_indices if lhs_row[index] != rhs_row[index]]
        logger.debug(f'different_column_indices={different_column_indices}')
        return self.ValueDifferenceResult(different_column_indices)

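# Illustrative sketch (not executed; the rows below are hypothetical). With 3 columns,
# matching key index 0 and no ignored columns, only indices 1 and 2 are compared:
#
#     detector = ValueDifferenceDetector(3, [0], [])
#     result = detector.detect_difference_between(['001', 'a', 'x'], ['001', 'b', 'x'])
#     result.different_column_indices   # -> [1]
#     result.has_difference             # -> True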

# ----------------------------------------------------------------------------------------------------------------------
# Reporting
# ----------------------------------------------------------------------------------------------------------------------

class PreScanner:

    class ScanResult:

        def __init__(self, number_of_columns, size_info_for_padding):
            self.number_of_columns = number_of_columns
            self.size_info_for_padding = size_info_for_padding

        @classmethod
        def for_lightly(cls, number_of_columns):
            return PreScanner.ScanResult(number_of_columns, None)

        @classmethod
        def for_deeply(cls, number_of_columns, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
            size_info_for_padding = cls.SizeInfoForPadding(lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length)
            return PreScanner.ScanResult(number_of_columns, size_info_for_padding)


        class SizeInfoForPadding:

            def __init__(self, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
                self.lhs_max_row_number = lhs_max_row_number
                self.lhs_max_row_length = lhs_max_row_length
                self.rhs_max_row_number = rhs_max_row_number
                self.rhs_max_row_length = rhs_max_row_length


    def __init__(self):
        pass

    @classmethod
    def scan(cls, context, csv_reader):

        if context.needs_size_info_for_padding:
            return PreScanner._scan_deeply(csv_reader)
        else:
            return PreScanner._scan_lightly(csv_reader)


    @classmethod
    def _scan_deeply(cls, csv_reader):
        """
        Notes
        -----
        Purpose of deep pre-scanning
        * Determine the number of columns for value difference detection
        * Get size information to format the horizontal report
        """

        start_ = time.perf_counter()

        lhs_max_row_length, rhs_max_row_length = 0, 0

        lhs_fact = csv_reader.read_lhs()
        rhs_fact = csv_reader.read_rhs()

        number_of_columns = cls._determine_number_of_columns_from(lhs_fact, rhs_fact)

        while lhs_fact.lhs_key != MatchingKeyCodec.END_of_KEY:
            lhs_max_row_length = max(lhs_max_row_length, UnicodeSupport.string_length_considering_east_asian_characters_of(str(lhs_fact.lhs_row)))
            lhs_fact = csv_reader.read_lhs()

        while rhs_fact.rhs_key != MatchingKeyCodec.END_of_KEY:
            rhs_max_row_length = max(rhs_max_row_length, UnicodeSupport.string_length_considering_east_asian_characters_of(str(rhs_fact.rhs_row)))
            rhs_fact = csv_reader.read_rhs()

        lhs_max_row_number = csv_reader.lhs_csv_state.row_number
        rhs_max_row_number = csv_reader.rhs_csv_state.row_number
        logger.debug(f'lhs_max_row_number={lhs_max_row_number}')
        logger.debug(f'rhs_max_row_number={rhs_max_row_number}')

        elapsed_time_ = time.perf_counter() - start_
        logger.debug(f'PreScanner#scan() elapsed_time:{elapsed_time_}[sec]')
        return PreScanner.ScanResult.for_deeply(number_of_columns,
                                                lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length)

    @classmethod
    def _scan_lightly(cls, csv_reader):
        """
        Notes
        -----
        Purpose of light pre-scanning
        * Determine the number of columns for value difference detection

        Vertical reports do not require size information for formatting.
        """

        lhs_fact = csv_reader.read_lhs()
        rhs_fact = csv_reader.read_rhs()

        return PreScanner.ScanResult.for_lightly(cls._determine_number_of_columns_from(lhs_fact, rhs_fact))

    @classmethod
    def _determine_number_of_columns_from(cls, lhs_fact, rhs_fact):

        number_of_columns = 0
        if lhs_fact.lhs_row:
            number_of_columns = len(lhs_fact.lhs_row)
        elif rhs_fact.rhs_row:
            number_of_columns = len(rhs_fact.rhs_row)

        return number_of_columns


class Mark(type):

    LHS_ONLY = '<'
    RHS_ONLY = '>'
    HAS_DIFF = '!'
    NON_DIFF = ' '
    NON_DIFF_EXPRESSLY = '='


class HeadingReporter:

    def __init__(self, context):
        self.cxt = context


    def report_heading(self):

        self._report_title()

        if self.cxt.shows_context_from_arguments:
            self._report_context()

    @classmethod
    @spacing_before(1)
    def _report_title(cls):
        print('============ Report ============')

    @spacing_before(1)
    def _report_context(self):

        print('● Context')
        print(f'File Path on the Left-Hand Side: {self.cxt.lhs_file_path}')
        print(f'File Path on the Right-Hand Side: {self.cxt.rhs_file_path}')
        print(f'Matching Key Indices: {self.cxt.matching_key_codec.matching_key_info_list}')
        print(f'Matching Key Is Unique?: {self.cxt.key_should_be_unique}')
        print(f'Column Indices to Ignore: {self.cxt.column_indices_to_ignore}')
        print(f'with Header?: {self.cxt.first_row_is_header}')
        print(f'Report Style: {"Vertical" if self.cxt.reports_in_vertical_style else "Two facing (Horizontal)"}')
        print(f'Show Count?: {self.cxt.shows_count}')
        print(f'Show Difference Only?: {self.cxt.shows_difference_only}')
        print(f'Show All?: {self.cxt.shows_all_lines}')
        print(f'Show Context?: {self.cxt.shows_context_from_arguments}')
        print(f'File Encoding for Left-Hand Side: {self.cxt.encoding_for_lhs}')
        print(f'File Encoding for Right-Hand Side: {self.cxt.encoding_for_rhs}')
        print(f'CSV Sniffing Size: {self.cxt.sniffing_size}')
        print('--- csv analysis conditions ---')
        print(f'Forces Individual Specified Conditions?: {self.cxt.forces_individual_specs}')
        print(f'column_separator_for_lhs: {self.cxt.display_string_for_column_separator(self.cxt.column_separator_for_lhs)}')
        print(f'column_separator_for_rhs: {self.cxt.display_string_for_column_separator(self.cxt.column_separator_for_rhs)}')
        print(f'line_separator_for_lhs: {self.cxt.display_string_for_line_separator(self.cxt.line_separator_for_lhs, FileArrangement.LHS)}')
        print(f'line_separator_for_rhs: {self.cxt.display_string_for_line_separator(self.cxt.line_separator_for_rhs, FileArrangement.RHS)}')
        print(f'quote_char_for_lhs: {self.cxt.quote_char_for_lhs}')
        print(f'quote_char_for_rhs: {self.cxt.quote_char_for_rhs}')
        print(f'skips_space_after_column_separator_for_lhs: {self.cxt.skips_space_after_column_separator_for_lhs}')
        print(f'skips_space_after_column_separator_for_rhs: {self.cxt.skips_space_after_column_separator_for_rhs}')


class DetailReporter(abc.ABC):

    def __init__(self, context):
        self.cxt = context


    def report_detail_heading(self):

        if not self.cxt.shows_details:
            return

        self._report_content_heading()
        self._report_file_name()

    @spacing_before(1)
    def _report_content_heading(self):
        if self.cxt.shows_difference_only:
            print('● Differences')
        elif self.cxt.shows_all_lines:
            print('● All')
        else:
            pass

    @abc.abstractmethod
    def _report_file_name(self):
        raise NotImplementedError()


    @abc.abstractmethod
    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        raise NotImplementedError()


    class Factory:

        def __init__(self):
            pass

        @staticmethod
        def reporter_for(context, scan_result):

            if context.reports_in_vertical_style:
                return VerticalReporter(context, scan_result)
            else:
                return HorizontalReporter(context, scan_result)


class HorizontalReporter(DetailReporter):

    class Template:

        DIFFERENT_COLUMN_GUIDE = 'Column indices with difference'
        PREFIX_of_DIFF_COLUMNS = ' @ '

        def __init__(self, lhs_max_row_number_length, lhs_max_row_length, rhs_max_row_number_length, rhs_max_row_length):

            self.lhs_max_row_number_length = lhs_max_row_number_length
            self.lhs_filler_length = 1
            self.lhs_max_row_length = lhs_max_row_length
            self.diff_mark_filler_length_in_front = 2
            self.diff_mark_length = 1
            self.diff_mark_filler_length_in_rear = 2
            self.rhs_max_row_number_length = rhs_max_row_number_length
            self.rhs_filler_length = 1
            self.rhs_max_row_length = rhs_max_row_length
            self.prefix_length_for_diff_columns_displays = len(self.PREFIX_of_DIFF_COLUMNS)

            self.lhs_length = self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length
            # Widen diff_mark_length to the full marker area: front filler + mark + rear filler.
            self.diff_mark_length = self.diff_mark_filler_length_in_front + self.diff_mark_length + self.diff_mark_filler_length_in_rear
            self.rhs_length = self.rhs_max_row_number_length + self.rhs_filler_length + self.rhs_max_row_length


        # --- heading-related description ---

        def division_string(self):
            return '-' * (self.lhs_length + self.diff_mark_length + self.rhs_length + self.prefix_length_for_diff_columns_displays + len(self.DIFFERENT_COLUMN_GUIDE))

        def file_name_description(self, lhs_file_name, rhs_file_name):

            lhs_file_name = UnicodeSupport.left_justified(lhs_file_name, self.lhs_length)
            diff_mark_spacing = ' ' * self.diff_mark_length
            rhs_file_name = UnicodeSupport.left_justified(rhs_file_name, self.rhs_length)
            prefix_length_spacing = ' ' * self.prefix_length_for_diff_columns_displays
            return f'{lhs_file_name}{diff_mark_spacing}{rhs_file_name}{prefix_length_spacing}{self.DIFFERENT_COLUMN_GUIDE}'


        # --- left-hand side related description ---

        def lhs_only_description(self, lhs_fact):

            lhs = self._lhs_description(lhs_fact)
            diff_mark_area = (' ' * self.diff_mark_filler_length_in_front) + Mark.LHS_ONLY + (' ' * self.diff_mark_filler_length_in_rear)
            return f'{lhs}{diff_mark_area}'

        def _lhs_description(self, lhs_fact):

            lhs_row_number = UnicodeSupport.right_justified(str(lhs_fact.lhs_row_number), self.lhs_max_row_number_length)
            spacing = ' ' * self.lhs_filler_length
            lhs_row = UnicodeSupport.left_justified(str(lhs_fact.lhs_row), self.lhs_max_row_length)
            return f'{lhs_row_number}{spacing}{lhs_row}'

        def _lhs_empty_description(self):
            return ' ' * (self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length)


        # --- right-hand side related description ---

        def rhs_only_description(self, rhs_fact):

            empty_lhs = self._lhs_empty_description()
            diff_mark_area = (' ' * self.diff_mark_filler_length_in_front) + Mark.RHS_ONLY + (' ' * self.diff_mark_filler_length_in_rear)
            rhs = self._rhs_description(rhs_fact)
            return f'{empty_lhs}{diff_mark_area}{rhs}'

        def _rhs_description(self, rhs_fact):

            rhs_row_number = UnicodeSupport.right_justified(str(rhs_fact.rhs_row_number), self.rhs_max_row_number_length)
            spacing = ' ' * self.rhs_filler_length
            rhs_row = UnicodeSupport.left_justified(str(rhs_fact.rhs_row), self.rhs_max_row_length)
            return f'{rhs_row_number}{spacing}{rhs_row}'


        # --- both sides related description ---

        def both_description(self, lhs_fact, rhs_fact, value_difference_result):

            lhs = self._lhs_description(lhs_fact)
            diff_mark = Mark.HAS_DIFF if value_difference_result.has_difference else Mark.NON_DIFF
            diff_mark_area = (' ' * self.diff_mark_filler_length_in_front) + diff_mark + (' ' * self.diff_mark_filler_length_in_rear)
            rhs = self._rhs_description(rhs_fact)
            prefix_of_diff_columns = self.PREFIX_of_DIFF_COLUMNS if value_difference_result.has_difference else ''
            different_columns = str(value_difference_result.different_column_indices) if value_difference_result.has_difference else ''
            return f'{lhs}{diff_mark_area}{rhs}{prefix_of_diff_columns}{different_columns}'


    def __init__(self, context, scan_result):

        super(HorizontalReporter, self).__init__(context)
        self.cxt = context

        if context.needs_size_info_for_padding:
            size_info = scan_result.size_info_for_padding
            self.template = HorizontalReporter.Template(len(str(size_info.lhs_max_row_number)),
                                                        size_info.lhs_max_row_length,
                                                        len(str(size_info.rhs_max_row_number)),
                                                        size_info.rhs_max_row_length)
        else:
            self.template = None


    # --- report heading related ---

    def _report_file_name(self):

        print(self.template.division_string())
        print(self.template.file_name_description(os.path.basename(self.cxt.lhs_file_name), os.path.basename(self.cxt.rhs_file_name)))
        print(self.template.division_string())


    # --- report each case ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):

        if self.cxt.shows_details:
            print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):

        if (self.cxt.shows_difference_only and value_difference_result.has_difference) or self.cxt.shows_all_lines:
            print(self.template.both_description(lhs_fact, rhs_fact, value_difference_result))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):

        if self.cxt.shows_details:
            print(self.template.rhs_only_description(rhs_fact))


class VerticalReporter(DetailReporter):

    class Template:

        LHS_MARK = 'L'
        RHS_MARK = 'R'
        PREFIX_of_DIFF_COLUMNS = '@'

        def __init__(self):
            pass


        # --- heading-related description ---

        @classmethod
        def division_string(cls):
            return '-' * 80

        @classmethod
        def file_name_description(cls, mark, file_name):
            return f'{mark} {file_name}'


        # --- left-hand side related description ---

        @classmethod
        def lhs_only_description(cls, lhs_fact):
            return f'{Mark.LHS_ONLY} {cls.LHS_MARK} {str(lhs_fact.lhs_row_number)} {str(lhs_fact.lhs_row)}'


        # --- right-hand side related description ---

        @classmethod
        def rhs_only_description(cls, rhs_fact):
            return f'{Mark.RHS_ONLY} {cls.RHS_MARK} {str(rhs_fact.rhs_row_number)} {str(rhs_fact.rhs_row)}'


        # --- both sides related description ---

        @classmethod
        def both_description_heading(cls, value_difference_result):

            if value_difference_result.has_difference:
                return f'{Mark.HAS_DIFF} {cls.PREFIX_of_DIFF_COLUMNS} {str(value_difference_result.different_column_indices)}'
            else:
                return Mark.NON_DIFF_EXPRESSLY

        @classmethod
        def both_description_lhs(cls, lhs_fact, row_number_length):
            return f' {cls.LHS_MARK} {str(lhs_fact.lhs_row_number).rjust(row_number_length)} {str(lhs_fact.lhs_row)}'

        @classmethod
        def both_description_rhs(cls, rhs_fact, row_number_length):
            return f' {cls.RHS_MARK} {str(rhs_fact.rhs_row_number).rjust(row_number_length)} {str(rhs_fact.rhs_row)}'



    def __init__(self, context, _):

        super(VerticalReporter, self).__init__(context)
        self.cxt = context
        self.template = VerticalReporter.Template()


    # --- report heading related ---

    def _report_file_name(self):

        print(self.template.division_string())
        print(self.template.file_name_description(self.template.LHS_MARK, os.path.basename(self.cxt.lhs_file_name)))
        print(self.template.file_name_description(self.template.RHS_MARK, os.path.basename(self.cxt.rhs_file_name)))
        print(self.template.division_string())


    # --- report each case ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):

        if self.cxt.shows_details:
            print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):

        if (self.cxt.shows_difference_only and value_difference_result.has_difference) or self.cxt.shows_all_lines:

            row_number_length = max(len(str(lhs_fact.lhs_row_number)), len(str(rhs_fact.rhs_row_number)))

            print(self.template.both_description_heading(value_difference_result))
            print(self.template.both_description_lhs(lhs_fact, row_number_length))
            print(self.template.both_description_rhs(rhs_fact, row_number_length))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):

        if self.cxt.shows_details:
            print(self.template.rhs_only_description(rhs_fact))


class CountReporter:

    class Counter:

        def __init__(self):

            self.number_of_same_lines = 0
            self.number_of_lhs_only = 0
            self.number_of_rhs_only = 0
            self.number_of_differences = 0

            self.row_numbers_for_lhs_only = []
            self.row_numbers_for_rhs_only = []
            self.row_numbers_for_differences = {}

            self._max_digit = None

        def _increment_same_lines(self):
            self.number_of_same_lines += 1

        def _increment_lhs_only(self):
            self.number_of_lhs_only += 1

        def _increment_rhs_only(self):
            self.number_of_rhs_only += 1

        def _increment_differences(self):
            self.number_of_differences += 1

        def _add_row_number_for_lhs_only(self, row_number):
            self.row_numbers_for_lhs_only.append(row_number)

        def _add_row_number_for_rhs_only(self, row_number):
            self.row_numbers_for_rhs_only.append(row_number)

        def _add_row_number_for_differences(self, lhs_row_number, rhs_row_number):
            self.row_numbers_for_differences[lhs_row_number] = rhs_row_number


        def count_for_case_of_existed_only_on_lhs(self, row_number):
            self._increment_lhs_only()
            self._add_row_number_for_lhs_only(row_number)

        def count_for_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):

            if value_difference_result.has_difference:
                self._increment_differences()
                self._add_row_number_for_differences(lhs_fact.lhs_row_number, rhs_fact.rhs_row_number)
            else:
                self._increment_same_lines()

        def count_for_case_of_existed_only_on_rhs(self, row_number):
            self._increment_rhs_only()
            self._add_row_number_for_rhs_only(row_number)

        @property
        def sorted_row_numbers_for_differences(self):
            return sorted(self.row_numbers_for_differences.items(), key=lambda x: x[0])


        @property
        def max_digit(self):

            if self._max_digit is not None:
                return self._max_digit

            self._max_digit = max(
                len(str(self.number_of_same_lines)),
                len(str(self.number_of_lhs_only)),
                len(str(self.number_of_rhs_only)),
                len(str(self.number_of_differences)),
            )
            return self._max_digit


    def __init__(self, shows_count):
        self.shows_count = shows_count
        self.counter = self.Counter()


    def _func_of_right_justified_number(self):
        return lambda number: str(number).rjust(self.counter.max_digit)

    @spacing_before(1)
    def report_count(self):

        if not self.shows_count:
            return

        print('● Count & Row number')

        rjust = self._func_of_right_justified_number()
        print('same lines : {}'.format(rjust(self.counter.number_of_same_lines)))
        print('left side only ({}): {} :-- Row Numbers -->: {}'.format(Mark.LHS_ONLY, rjust(self.counter.number_of_lhs_only), self.counter.row_numbers_for_lhs_only))
        print('right side only ({}): {} :-- Row Numbers -->: {}'.format(Mark.RHS_ONLY, rjust(self.counter.number_of_rhs_only), self.counter.row_numbers_for_rhs_only))
        print('with differences ({}): {} :-- Row Number Pairs -->: {}'.format(Mark.HAS_DIFF, rjust(self.counter.number_of_differences), self.counter.sorted_row_numbers_for_differences))


class UnicodeSupport:

    @classmethod
    def left_justified(cls, value, length):
        return f"{value}{' ' * (length - cls.string_length_considering_east_asian_characters_of(value))}"

    @classmethod
    def right_justified(cls, value, length):
        return f"{' ' * (length - cls.string_length_considering_east_asian_characters_of(value))}{value}"

    @staticmethod
    def string_length_considering_east_asian_characters_of(text):
        return functools.reduce(lambda counting, c: counting + (2 if unicodedata.east_asian_width(c) in 'FWA' else 1),
                                text, 0)

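# Illustrative sketch (not executed): East Asian fullwidth/wide/ambiguous characters are
# counted as 2 columns, which keeps the horizontal report aligned:
#
#     UnicodeSupport.string_length_considering_east_asian_characters_of('abcあ')   # -> 5
#     UnicodeSupport.left_justified('あ', 4)    # -> 'あ  ' (padded with 2 spaces)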

# ----------------------------------------------------------------------------------------------------------------------
# CSV Reading
# ----------------------------------------------------------------------------------------------------------------------

class FileArrangement(type):

    LHS = '_for_lhs'
    RHS = '_for_rhs'


class CsvDialectFixer:

    def __init__(self):
        pass

    @classmethod
    def fixed_dialect(cls, context, csv_file, file_arrangement):

        if context.forces_individual_specs:
            return cls._dialect_from_context(context, file_arrangement)
        else:
            return cls._try_sniffing(context, csv_file, file_arrangement)


    @classmethod
    def _dialect_from_context(cls, context, file_arrangement):

        dialect = csv.excel()
        dialect.delimiter = getattr(context, "column_separator" + file_arrangement)
        dialect.lineterminator = getattr(context, "line_separator" + file_arrangement)
        dialect.quotechar = getattr(context, "quote_char" + file_arrangement)
        dialect.skipinitialspace = getattr(context, "skips_space_after_column_separator" + file_arrangement)

        return dialect, context

    @classmethod
    def _try_sniffing(cls, context, csv_file, file_arrangement):

        try:
            return cls._sniff(context, csv_file, file_arrangement)

        except csv.Error as e:

            logger.warning(f'Sniffing failed. Generated a dialect from context instead. [type={type(e)}, args={str(e.args)}, message={traceback.format_exception_only(type(e), e)}]')
            return cls._dialect_from_context(context, file_arrangement)

        finally:
            csv_file.seek(0)

    @classmethod
    def _sniff(cls, context, csv_file, file_arrangement):

        sample = csv_file.read(context.sniffing_size)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)

        adjusted_context = cls._adjust_context_with(dialect, has_header, context, file_arrangement)

        return dialect, adjusted_context

    @classmethod
    def _adjust_context_with(cls, dialect, has_header, context, file_arrangement):

        setattr(context, "column_separator" + file_arrangement, dialect.delimiter)
        setattr(context, "line_separator" + file_arrangement, dialect.lineterminator)
        setattr(context, "quote_char" + file_arrangement, dialect.quotechar)
        setattr(context, "skips_space_after_column_separator" + file_arrangement, dialect.skipinitialspace)
        context.first_row_is_header = has_header if context.header is None else context.header == 'y'

        return context

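# Illustrative sketch of the sniffing path above (not executed; the file name is hypothetical).
# For a single file it boils down to the standard csv module calls:
#
#     with open('sample.csv', newline='') as f:
#         sample = f.read(4096)
#         dialect = csv.Sniffer().sniff(sample)
#         has_header = csv.Sniffer().has_header(sample)
#         f.seek(0)
#         rows = list(csv.reader(f, dialect))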

def show_dialect_for_debugging(dialect, context, message, file_arrangement):

    logger.debug(f'---{message}---')
    logger.debug(f'sniffing dialect={dialect}')
    logger.debug(f'sniffing dialect csv.excel={isinstance(dialect, csv.excel)}')
    logger.debug(f'sniffing dialect csv.excel_tab={isinstance(dialect, csv.excel_tab)}')
    logger.debug(f'sniffing dialect csv.unix_dialect={isinstance(dialect, csv.unix_dialect)}')
    logger.debug(f'sniffing dialect.delimiter={context.display_string_for_column_separator(dialect.delimiter)}')
    logger.debug(f'sniffing dialect.doublequote={dialect.doublequote}')
    logger.debug(f'sniffing dialect.escapechar={dialect.escapechar}')
    logger.debug(f'sniffing dialect.lineterminator={context.display_string_for_line_separator(dialect.lineterminator, file_arrangement)}')
    logger.debug(f'sniffing dialect.quotechar={dialect.quotechar}')
    logger.debug(f'sniffing dialect.quoting={dialect.quoting}')
    logger.debug(f'sniffing dialect.skipinitialspace={dialect.skipinitialspace}')



class LhsFact:

    def __init__(self, lhs_row_number, lhs_row, lhs_key):

        logger.debug(f'LhsFact created: lhs_row_number={lhs_row_number}, lhs_row={lhs_row}, lhs_key={lhs_key}')

        self.lhs_row_number = lhs_row_number
        self.lhs_row = lhs_row
        self.lhs_key = lhs_key


class RhsFact:

    def __init__(self, rhs_row_number, rhs_row, rhs_key):

        logger.debug(f'RhsFact created: rhs_row_number={rhs_row_number}, rhs_row={rhs_row}, rhs_key={rhs_key}')

        self.rhs_row_number = rhs_row_number
        self.rhs_row = rhs_row
        self.rhs_key = rhs_key


class CsvReader:

    class State:

        def __init__(self, csv_file, dialect, file_name, first_row_is_header):

            self._csv_file = csv_file
            self._dialect = dialect
            self._file_name = file_name
            self._first_row_is_header = first_row_is_header

            self._csv_reader = csv.reader(csv_file, dialect)
            self._row_number = 0
            self._previous_key = ""

        def reset(self):

            self._csv_file.seek(0)
            self._csv_reader = csv.reader(self._csv_file, self._dialect)
            self._row_number = 0
            self._previous_key = ""

        def increment_row_number(self):

            if self._previous_key == MatchingKeyCodec.END_of_KEY:
                return

            self._row_number += 1

        def key_changed(self, new_key):

            if self._is_header():
                return

            self._previous_key = new_key

        def _is_header(self):
            return self.row_number == 0 and self._first_row_is_header

        @property
        def csv_reader(self):
            return self._csv_reader

        @property
        def file_name(self):
            return self._file_name

        @property
        def row_number(self):
            return self._row_number

        @property
        def previous_key(self):
            return self._previous_key


    def __init__(self, lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, context):

        show_dialect_for_debugging(lhs_dialect, context, 'LHS CSV', FileArrangement.LHS)
        show_dialect_for_debugging(rhs_dialect, context, 'RHS CSV', FileArrangement.RHS)

        self.lhs_csv_state = CsvReader.State(lhs_csv, lhs_dialect, context.lhs_file_name, context.first_row_is_header)
        self.rhs_csv_state = CsvReader.State(rhs_csv, rhs_dialect, context.rhs_file_name, context.first_row_is_header)
        self.cxt = context

        self.skip_header()

    def skip_header(self):

        if self.cxt.first_row_is_header:
            _ = self.read_lhs()
            _ = self.read_rhs()

    def reset(self):

        self.lhs_csv_state.reset()
        self.rhs_csv_state.reset()
        self.skip_header()

    def read_lhs(self):

        lhs_row, lhs_key = self._read_csv(self.lhs_csv_state)
        self.lhs_csv_state.increment_row_number()
        return LhsFact(self.lhs_csv_state.row_number, lhs_row, lhs_key)

    def read_rhs(self):

        rhs_row, rhs_key = self._read_csv(self.rhs_csv_state)
        self.rhs_csv_state.increment_row_number()
        return RhsFact(self.rhs_csv_state.row_number, rhs_row, rhs_key)

    def _read_csv(self, csv_state):

        try:
            row = next(csv_state.csv_reader)
        except StopIteration:
            csv_state.key_changed(MatchingKeyCodec.END_of_KEY)
            return [], MatchingKeyCodec.END_of_KEY

        new_key = self.cxt.matching_key_codec.managed_key_for(row)
        self._detect_key_violation(new_key, csv_state)

        csv_state.key_changed(new_key)

        return row, new_key

    def _detect_key_violation(self, new_key, csv_state):

        if csv_state.previous_key == '':
            return

        if new_key < csv_state.previous_key:
            logger.error(f'matching keys in {csv_state.file_name} are not sorted.'
                         f' [current_key={MatchingKeyCodec.decode_key(new_key)}, previous_key={MatchingKeyCodec.decode_key(csv_state.previous_key)}, matching-key-indices={self.cxt.matching_key_codec.matching_key_info_list}]'
                         f' If the key is a number without zero padding, specify the max size of the key after a colon, like -k0:8.')
            sys.exit(1)

        if self.cxt.key_should_be_unique and new_key == csv_state.previous_key:
            logger.error(f'matching keys in {csv_state.file_name} are not unique.'
                         f' [current_key={MatchingKeyCodec.decode_key(new_key)}, previous_key={MatchingKeyCodec.decode_key(csv_state.previous_key)}, matching-key-indices={self.cxt.matching_key_codec.matching_key_info_list}]')
            sys.exit(1)

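# Illustrative command lines (a sketch; the script and CSV file names are hypothetical):
#
#     python3 csvdiff.py old.csv new.csv -k 0:8,3 -d
#         Match on column 0 zero-padded to 8 digits plus column 3, and report only the
#         lines with differences.
#
#     python3 csvdiff.py old.csv new.csv --encoding shift_jis -v -a -c
#         Read both files as shift_jis and report all lines plus the counts in vertical style.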

if __name__ == '__main__':

    main()