Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1#!/usr/bin/env python2 

2# -*- coding: utf-8 -*- 

3import abc 

4import csv 

5import functools 

6import logging 

7import os 

8import sys 

9import timeit 

10from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 

11 

12 

13# ---------------------------------------------------------------------------------------------------------------------- 

14# Decorators 

15# ---------------------------------------------------------------------------------------------------------------------- 

16 

def show_execution_time():
    """Build a decorator that prints the wall-clock time of each call.

    The decorated callable is timed with ``timeit.default_timer``; after it
    returns, an empty line, ``elapsed_time=<seconds>[sec]`` and another empty
    line are printed.  Nothing is printed when the callable raises.

    Returns:
        A decorator.  The wrapper it produces forwards all arguments,
        returns the wrapped callable's result and keeps its metadata.
    """

    def _execution_time(func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = timeit.default_timer()

            # Keep the result so that decorating a function does not turn
            # its return value into None.
            result = func(*args, **kwargs)

            elapsed_time = timeit.default_timer() - start
            print('')
            print("elapsed_time={0}".format(elapsed_time) + "[sec]")
            print('')

            return result

        return wrapper

    return _execution_time

35 

36 

def spacing_before(number_of_lines):
    """Build a decorator that prints blank lines before each call.

    Args:
        number_of_lines: How many empty lines to print before the wrapped
            callable runs.  A falsy value (``0``, ``None``) is treated as 1.

    Returns:
        A decorator.  The wrapper it produces forwards all arguments,
        returns the wrapped callable's result and keeps its metadata.
    """

    number_of_lines = number_of_lines or 1

    def _spacing_before(func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for _ in range(number_of_lines):
                print('')

            # Propagate the result instead of silently dropping it.
            return func(*args, **kwargs)

        return wrapper

    return _spacing_before

53 

54 

55# ---------------------------------------------------------------------------------------------------------------------- 

56# Entrance 

57# ---------------------------------------------------------------------------------------------------------------------- 

58 

59# @show_execution_time() 

def main():
    """Program entry point.

    Sets up logging, builds the context from the command-line arguments,
    logs that context at debug level and runs the comparison.  An
    ``IndexError`` raised while running is reported through the logger and
    the process exits with status 1.
    """

    configure()

    cxt = context_from_arguments()
    show_context_for_debugging(cxt)

    try:
        run_in(cxt)
    except IndexError as error:
        template = ('It is possible that the number of columns in the row is not aligned. '
                    'Please check the csv data. If not, please file an issue. '
                    '[{}, description={}]')
        logger.error(template.format(type(error), error))
        sys.exit(1)

72 

73 

class App(type):
    """Holder for application-wide constants; it is never instantiated here."""

    # Version string shown by the --version command-line option.
    VERSION = '1.0.0'

77 

78 

class LoggingConfig(type):
    """Holder for the logging levels, formats and log-file path.

    To debug, lower CONSOLE_LEVEL or FILE_LEVEL.
    """

    # Level passed to logging.basicConfig().
    BASE_LEVEL = logging.DEBUG

    # Console (stream) handler settings.
    CONSOLE_LEVEL = logging.ERROR
    CONSOLE_FORMAT = '%(levelname)s: %(message)s'

    # File handler settings.
    FILE_LEVEL = logging.WARNING
    FILE_FORMAT = '%(asctime)s: %(levelname)s: %(message)s'
    FILE_PATH = 'csvdiff.log'

91 

92 

93logger = logging.getLogger(__name__) 

94 

95 

def configure():
    """Configure logging for this module.

    Calls ``logging.basicConfig`` with the base level, then attaches a
    console handler and a file handler (opened in ``'w'`` mode) to the
    module logger, each with the level and format from ``LoggingConfig``.
    Propagation to ancestor loggers is switched off.
    """

    logging.basicConfig(level=LoggingConfig.BASE_LEVEL)

    def prepared(handler, level, message_format):
        # Apply level and formatter, then hand the same handler back.
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter(message_format))
        return handler

    console_handler = prepared(logging.StreamHandler(),
                               LoggingConfig.CONSOLE_LEVEL,
                               LoggingConfig.CONSOLE_FORMAT)
    log_file_handler = prepared(logging.FileHandler(filename=LoggingConfig.FILE_PATH, mode='w'),
                                LoggingConfig.FILE_LEVEL,
                                LoggingConfig.FILE_FORMAT)

    # Both handlers are fully built before either one is attached.
    for handler in (console_handler, log_file_handler):
        logger.addHandler(handler)

    logger.propagate = False

112 

113 

114# ---------------------------------------------------------------------------------------------------------------------- 

115# Context Preparation 

116# ---------------------------------------------------------------------------------------------------------------------- 

117 

def context_from_arguments():
    """Parse the command line and wrap the result in a ``Context``.

    Returns:
        Context: built from ``parser.parse_args()``.  argparse exits the
        process itself on invalid arguments, ``--help`` and ``--version``.
    """

    def arg_type_matching_key_in_csv(x):
        # "0:8,3" -> [MatchingKeyInfo('0:8'), MatchingKeyInfo('3')]
        return [MatchingKeyInfo(element) for element in x.split(',')]

    def arg_type_int_in_csv(x):
        # "3,7" -> [3, 7]
        return [int(element) for element in x.split(',')]

    parser = ArgumentParser(prog='csv-diff-python2@blue-monk', formatter_class=ArgumentDefaultsHelpFormatter)

    # Program name & Version -------------------------------------------------------------------------------------------
    parser.add_argument('--version', action='version', version='%(prog)s {}'.format(App.VERSION))

    # Input CSV file paths ---------------------------------------------------------------------------------------------
    parser.add_argument('lhs_file_name', type=str, help='Absolute/Relative path to left-hand side file.')
    parser.add_argument('rhs_file_name', type=str, help='Absolute/Relative path to right-hand side file.')

    # Matching conditions ----------------------------------------------------------------------------------------------
    # The string default is run through the type callable by argparse.
    parser.add_argument('-k', '--matching-keys', type=arg_type_matching_key_in_csv, default='0',
                        help='Matching key indices(from 0) for Input CSV in CSV format. For non-fixed length numbers, specify the number of digits after ":". e.g.: 0:8,3')
    parser.add_argument('-u', '--unique-key', default=False, action='store_true',
                        help="Specify if the matching key is unique. Then, if it detects that the matching key is not unique, an error will occur.")
    parser.add_argument('-i', '--ignore-columns', type=arg_type_int_in_csv, default=[],
                        help='Specify the index of the column to be ignored in CSV format. e.g.: 3,7')

    # Report styles ----------------------------------------------------------------------------------------------------
    parser.add_argument('-v', '--vertical-style', default=False, action='store_true',
                        help='Report in vertical style. If not specified, report in horizontal(two facing) style.')

    parser.add_argument('-c', '--show-count', default=False, action='store_true',
                        help='Report the number of differences. Treat this as True if neither -d nor -a is specified.')

    display_group = parser.add_mutually_exclusive_group()
    display_group.add_argument('-d', '--show-difference-only', default=False, action='store_true',
                               help='Report the lines with the difference. Can be used with option -c. Cannot be used with option -a.')
    display_group.add_argument('-a', '--show-all-lines', action='store_true',
                               help='Report on all lines. Can be used with option -c. Cannot be used with option -d.')

    parser.add_argument('-x', '--show-context-from-arguments', default=False, action='store_true',
                        help='Report the context generated from the arguments and CSV sniffing.')

    # CSV analysis conditions ------------------------------------------------------------------------------------------
    parser.add_argument('-H', '--header', type=str, default=None, choices=['n', 'y'],
                        help='If specified, this specification will be enforced.')

    # The size is a number: parse it as int so a user-supplied value has the
    # same type as the integer default (it was previously left as a string).
    parser.add_argument('-S', '--sniffing-size', type=int, default=4096,
                        help="If csv sniffing fails, try specifying a size larger than 4096. Or Explicitly specify CSV file conditions like '--column-separator-for-lhs TAB'. Check help with -h option.")

    parser.add_argument('-F', '--force-individual-specs', action='store_true',
                        help="If you don't want to rely on csv sniffing, specify it, and then specify --column-separator and so on separately.")

    parser.add_argument('--column-separator', type=str, default=None, choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process both sides CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator', type=str, default=None, choices=['LF', 'CRLF'],
                        help='Process both sides CSV file using the specified line separator.')

    parser.add_argument('--quote-char', type=str, default=None, choices=['"', "'"],
                        help='Process both sides CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator', action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the both sides CSV file.')

    # CSV analysis conditions by left and right ------------------------------------------------------------------------
    parser.add_argument('--column-separator-for-lhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process left-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--column-separator-for-rhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process right-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator-for-lhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process left-hand side CSV file using the specified line separator.')

    parser.add_argument('--line-separator-for-rhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process right-hand side CSV file using the specified line separator.')

    parser.add_argument('--quote-char-for-lhs', type=str, default='"', choices=['"', "'"],
                        help='Process left-hand side CSV file using the specified quote character.')

    parser.add_argument('--quote-char-for-rhs', type=str, default='"', choices=['"', "'"],
                        help='Process right-hand side CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator-for-lhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the left side.')

    parser.add_argument('--no-skip-space-after-column-separator-for-rhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the right side.')

    # ------------------------------------------------------------------------------------------------------------------

    return Context(parser.parse_args())

209 

210 

class Context:
    """Run-time settings derived from the parsed command-line arguments.

    The constructor copies and derives the settings, then checks that both
    input paths exist and are regular files (logging an error and exiting
    with status 1 otherwise), and finally enables the count report when no
    report option was chosen.
    """

    # Symbolic name -> actual line separator.
    LINE_SEPARATOR_s = {
        "CR": '\r',
        "LF": '\n',
        "CRLF": '\r\n',
        None: '<None>',
    }

    # Symbolic name -> actual column separator.
    COLUMN_SEPARATOR_s = {
        "COMMA": ',',
        "TAB": '\t',
        "SEMICOLON": ';',
        None: '<None>',
    }

    def __init__(self, args):
        """Populate the context from an argparse namespace.

        Args:
            args: Namespace produced by the parser in
                ``context_from_arguments``.
        """

        # Input CSV file paths -----------------------------------------------------------------------------------------
        self.lhs_file_name = args.lhs_file_name
        self.rhs_file_name = args.rhs_file_name
        self.lhs_file_path = os.path.abspath(args.lhs_file_name)
        self.rhs_file_path = os.path.abspath(args.rhs_file_name)

        # Matching conditions ------------------------------------------------------------------------------------------
        self.matching_key_codec = MatchingKeyCodec(args.matching_keys)
        self.key_should_be_unique = args.unique_key
        self.column_indices_to_ignore = args.ignore_columns

        # Report styles ------------------------------------------------------------------------------------------------
        self.reports_in_vertical_style = args.vertical_style
        self.reports_in_horizontal_style = not args.vertical_style

        self.shows_count = args.show_count
        self.shows_difference_only = args.show_difference_only
        self.shows_all_lines = args.show_all_lines
        self.shows_details = bool(self.shows_difference_only or self.shows_all_lines)
        self.shows_context_from_arguments = args.show_context_from_arguments

        self.needs_size_info_for_padding = self.shows_details and self.reports_in_horizontal_style

        # CSV analysis conditions --------------------------------------------------------------------------------------
        self.header = args.header
        self.first_row_is_header = None

        self.sniffing_size = args.sniffing_size

        self.forces_individual_specs = args.force_individual_specs

        # For each setting, the "both sides" option wins when individual
        # specs are forced and it was given; otherwise the per-side options
        # are used.
        if self.forces_individual_specs and args.column_separator:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator]
        else:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_lhs]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_rhs]

        if self.forces_individual_specs and args.line_separator:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator]
        else:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator_for_lhs]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator_for_rhs]

        if self.forces_individual_specs and args.quote_char:
            self.quote_char_for_lhs = args.quote_char
            self.quote_char_for_rhs = args.quote_char
        else:
            self.quote_char_for_lhs = args.quote_char_for_lhs
            self.quote_char_for_rhs = args.quote_char_for_rhs

        if self.forces_individual_specs and args.no_skip_space_after_column_separator:
            self.skips_space_after_column_separator_for_lhs = not args.no_skip_space_after_column_separator
            self.skips_space_after_column_separator_for_rhs = not args.no_skip_space_after_column_separator
        else:
            # Honour the per-side flags, like the three settings above.
            # They used to be parsed but ignored (both sides were always
            # True here).  getattr keeps namespaces without them working.
            self.skips_space_after_column_separator_for_lhs = \
                not getattr(args, 'no_skip_space_after_column_separator_for_lhs', False)
            self.skips_space_after_column_separator_for_rhs = \
                not getattr(args, 'no_skip_space_after_column_separator_for_rhs', False)

        self._validate()
        self._normalize()

    def _validate(self):
        """Exit with status 1 unless both input paths are existing files."""

        if not os.path.exists(self.lhs_file_path):
            logger.error('lhs_file_path not exists. [lhs_file_path={}]'.format(self.lhs_file_path))
            sys.exit(1)
        if not os.path.exists(self.rhs_file_path):
            logger.error('rhs_file_path not exists. [rhs_file_path={}]'.format(self.rhs_file_path))
            sys.exit(1)

        if not os.path.isfile(self.lhs_file_path):
            logger.error('lhs_file_path is not a file. [lhs_file_path={}]'.format(self.lhs_file_path))
            sys.exit(1)
        if not os.path.isfile(self.rhs_file_path):
            logger.error('rhs_file_path is not a file. [rhs_file_path={}]'.format(self.rhs_file_path))
            sys.exit(1)

    def _normalize(self):
        """Default to the count report when no report option was selected."""

        if not any([self.shows_count, self.shows_difference_only, self.shows_all_lines]):
            self.shows_count = True

    def display_string_for_column_separator(self, value):
        """Return the symbolic name of a column separator for display.

        Args:
            value: An actual separator such as ``','`` or a tab character.

        Returns:
            The matching key of ``COLUMN_SEPARATOR_s`` (e.g. ``'TAB'``), or
            ``'undefined(<value>)'`` when the separator is not in the table.
        """

        candidates = [k for k, v in self.COLUMN_SEPARATOR_s.items() if v == value]
        if candidates:
            return candidates[0]

        # The original built this string but never returned it, so unknown
        # separators were displayed as None.
        return 'undefined({})'.format(value)

319 

320 

def show_context_for_debugging(cxt):
    """Write every setting of the context to the debug log.

    Each setting is logged as one ``name=value`` line, in a fixed order.
    Column separators are logged by symbolic name and line separators as
    hex (Python 2 ``str.encode('hex')``).

    Args:
        cxt: The ``Context`` to dump.
    """

    def emit(label, value):
        logger.debug('{}={}'.format(label, value))

    # Settings whose log label equals the attribute name.
    plain_attributes = (
        'lhs_file_name',
        'rhs_file_name',
        'lhs_file_path',
        'rhs_file_path',
        'matching_key_codec',
        'key_should_be_unique',
        'column_indices_to_ignore',
        'reports_in_vertical_style',
        'reports_in_horizontal_style',
        'shows_count',
        'shows_difference_only',
        'shows_all_lines',
        'shows_context_from_arguments',
        'needs_size_info_for_padding',
        'first_row_is_header',
        'sniffing_size',
    )
    for attribute in plain_attributes:
        emit(attribute, getattr(cxt, attribute))

    # The label differs from the attribute name here.
    emit('force_individual_specs', cxt.forces_individual_specs)

    emit('column_separator_for_lhs', cxt.display_string_for_column_separator(cxt.column_separator_for_lhs))
    emit('column_separator_for_rhs', cxt.display_string_for_column_separator(cxt.column_separator_for_rhs))
    emit('line_separator_for_lhs', cxt.line_separator_for_lhs.encode('hex'))
    emit('line_separator_for_rhs', cxt.line_separator_for_rhs.encode('hex'))

    for attribute in ('quote_char_for_lhs',
                      'quote_char_for_rhs',
                      'skips_space_after_column_separator_for_lhs',
                      'skips_space_after_column_separator_for_rhs'):
        emit(attribute, getattr(cxt, attribute))

    emit('MatchingKeyCodec#END_of_KEY', MatchingKeyCodec.END_of_KEY)

354 

355 

356# ---------------------------------------------------------------------------------------------------------------------- 

357# Matching Key Treatment 

358# ---------------------------------------------------------------------------------------------------------------------- 

359 

class MatchingKeyInfo:
    """One matching-key column given as ``INDEX`` or ``INDEX:MAX_LENGTH``.

    Attributes:
        index: Zero-based column index of the key.
        max_length: Width the key value is left-padded to with ``'0'``;
            ``0`` when no width was specified (no padding).
    """

    def __init__(self, specified_string):
        """Parse a single key specification such as ``'0'`` or ``'0:8'``.

        Empty parts around ``':'`` are ignored and parts beyond the second
        are not used.  A missing or non-numeric part is reported through
        the logger and the process exits with status 1.

        Args:
            specified_string: The text of one key specification.
        """

        # A real list, so indexing does not depend on what filter() returns.
        elements = [element for element in specified_string.split(':') if element != '']

        # An empty specification falls through to the "should be a number"
        # error instead of raising an uncaught IndexError.
        index = elements[0] if elements else ''
        self.index = self._transform_into_numeric(index, 'index')

        max_length = elements[1] if len(elements) > 1 else '0'
        self.max_length = self._transform_into_numeric(max_length, 'max_length')

    def __repr__(self):
        return '{}({!r}, {!r})'.format(self.__class__.__name__, self.index, self.max_length if self.max_length > 0 else '<not specified>')

    @classmethod
    def _transform_into_numeric(cls, value, name):
        """Convert a digit string to int, or log an error and exit(1).

        Args:
            value: Text expected to consist of digits only.
            name: Name of the part being converted, used in the error text.
        """

        if not value.isdigit():
            logger.error('MATCHING_KEY_INDICES should be a number. See also help. [specified {}={}]'.format(name, value))
            # sys.exit is always available; the bare exit() builtin is
            # installed by the site module and is meant for interactive use.
            sys.exit(1)

        return int(value)

    def key_for(self, row):
        """Return this key's value from ``row``, left-padded with zeros.

        Raises:
            IndexError: If ``row`` has no column at ``self.index``.
        """
        return row[self.index].rjust(self.max_length, '0')

386 

387 

class MatchingKeyCodec:
    """Builds and splits the composite key used to match rows.

    A managed key is ``SEPARATOR + key1 + SEPARATOR + key2 + SEPARATOR ...``.
    ``END_of_KEY`` is the sentinel key value that the matching loops compare
    against to detect the end of input.
    """

    END_of_KEY = 'ZZZ'
    SEPARATOR = '..'

    def __init__(self, matching_key_info_list):
        """Store the key definitions.

        Args:
            matching_key_info_list: Iterable of objects providing ``index``
                and ``key_for(row)`` (``MatchingKeyInfo`` instances).
        """
        # Materialise once: the definitions are iterated for every row, so a
        # one-shot iterator must not be kept as is.
        self.matching_key_info_list = list(matching_key_info_list)

    def __repr__(self):
        return '{}({!r})'.format(self.__class__.__name__, self.matching_key_info_list)

    def managed_key_for(self, row):
        """Return the composite key of ``row``.

        If a key index is out of range for ``row``, an error is logged and
        the process exits with status 1.
        """

        try:
            return self.SEPARATOR + ''.join(matching_key.key_for(row) + self.SEPARATOR
                                            for matching_key in self.matching_key_info_list)
        except IndexError:
            logger.error('one of the indices specified for MATCHING_KEY_INDICES is out of range [MATCHING_KEY_INDICES={}, number of columns = {}, row={}]'.format(self.matching_key_info_list, len(row), row))
            sys.exit(1)

    @property
    def matching_key_indices(self):
        """Column indices of all matching keys, in definition order."""
        return [matching_key_info.index for matching_key_info in self.matching_key_info_list]

    @classmethod
    def decode_key(cls, key):
        """Split a managed key into its parts.  Leave the padding as it is.

        Exactly one leading and one trailing ``SEPARATOR`` are removed.
        ``str.strip(SEPARATOR)`` would strip every ``'.'`` at both ends and
        so eat dots that belong to the first or last key value.  A key
        without the separators (e.g. ``END_of_KEY``) is returned as a
        single-element list.

        NOTE: values that themselves contain ``SEPARATOR`` cannot be
        recovered; the encoding is ambiguous for them.
        """
        separator = cls.SEPARATOR

        if key.startswith(separator):
            key = key[len(separator):]
        if key.endswith(separator):
            key = key[:-len(separator)]

        return key.split(separator)

416 

417 

418# ---------------------------------------------------------------------------------------------------------------------- 

419# Control and Determine if it exists only on the left, only on the right, or both 

420# ---------------------------------------------------------------------------------------------------------------------- 

421 

def run_in(context):
    """Open both CSV files and run the comparison.

    The dialect of each file is fixed first (left, then right); each step
    may return an adjusted context that is used from then on.  The files
    are then pre-scanned, the reader is reset and the differences are
    detected and reported.

    Args:
        context: The ``Context`` built from the command line.
    """

    with open(context.lhs_file_path, mode='r') as lhs_csv:
        with open(context.rhs_file_path, mode='r') as rhs_csv:

            lhs_dialect, cxt = CsvDialectFixer.fixed_dialect(context, lhs_csv, FileArrangement.LHS)
            rhs_dialect, cxt = CsvDialectFixer.fixed_dialect(cxt, rhs_csv, FileArrangement.RHS)

            reader = CsvReader(lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, cxt)

            # The pre-scan reads rows, so reset the reader before the real pass.
            scan_result = PreScanner.scan(cxt, reader)
            reader.reset()

            detect_diff(cxt, reader, scan_result)

435 

436 

def detect_diff(context, csv_reader, pre_scan_result):
    """Match the rows of both files by key, then count and report them.

    Prints the headings, runs the key matching with callbacks that update
    the counter and the detail report for each case (left only, both
    sides, right only), and finally prints the counts.

    Args:
        context: The (possibly adjusted) ``Context``.
        csv_reader: Reader supplying rows from both files.
        pre_scan_result: Result of ``PreScanner.scan``.
    """

    difference_detector = ValueDifferenceDetector(pre_scan_result.number_of_columns,
                                                  context.matching_key_codec.matching_key_indices,
                                                  context.column_indices_to_ignore)

    headings = HeadingReporter(context)
    details = DetailReporter.Factory.reporter_for(context, pre_scan_result)
    counts = CountReporter(context.shows_count)
    counter = counts.counter

    headings.report_heading()
    details.report_detail_heading()

    def on_lhs_only(lhs_fact):
        counter.count_for_case_of_existed_only_on_lhs(lhs_fact.lhs_row_number)
        details.report_case_of_existed_only_on_lhs(lhs_fact)

    def on_both_sides(lhs_fact, rhs_fact):
        difference = difference_detector.detect_difference_between(lhs_fact.lhs_row, rhs_fact.rhs_row)
        counter.count_for_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)
        details.report_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)

    def on_rhs_only(rhs_fact):
        counter.count_for_case_of_existed_only_on_rhs(rhs_fact.rhs_row_number)
        details.report_case_of_existed_only_on_rhs(rhs_fact)

    perform_key_matching(csv_reader, on_lhs_only, on_both_sides, on_rhs_only)

    counts.report_count()

469 

470 

def perform_key_matching(csv_reader, callback_for_lhs_only, callback_for_both_sides, callback_for_rhs_only):
    """Step through both inputs in parallel, comparing keys.

    The side with the smaller key is reported through its "only" callback
    and advanced; equal keys are reported through the "both sides"
    callback and both sides advance.  The loop ends once both sides return
    ``MatchingKeyCodec.END_of_KEY``.

    Args:
        csv_reader: Object with ``read_lhs()`` / ``read_rhs()`` returning
            facts that carry ``lhs_key`` / ``rhs_key``.
        callback_for_lhs_only: Called with the left fact.
        callback_for_both_sides: Called with the left and right facts.
        callback_for_rhs_only: Called with the right fact.
    """

    end_of_key = MatchingKeyCodec.END_of_KEY

    left = csv_reader.read_lhs()
    right = csv_reader.read_rhs()

    while not (left.lhs_key == end_of_key and right.rhs_key == end_of_key):

        left_key, right_key = left.lhs_key, right.rhs_key

        if left_key < right_key:
            callback_for_lhs_only(left)
            left = csv_reader.read_lhs()

        elif left_key == right_key:
            callback_for_both_sides(left, right)
            left = csv_reader.read_lhs()
            right = csv_reader.read_rhs()

        elif left_key > right_key:
            callback_for_rhs_only(right)
            right = csv_reader.read_rhs()

490 

491 

492# ---------------------------------------------------------------------------------------------------------------------- 

493# Value-Difference Detection 

494# ---------------------------------------------------------------------------------------------------------------------- 

495 

class ValueDifferenceDetector:
    """Finds the columns in which two rows with the same key differ.

    Matching-key columns and ignored columns are excluded from comparison.
    """

    class ValueDifferenceResult:
        """Column indices whose values differ between two rows."""

        def __init__(self, different_column_indices):
            self.different_column_indices = different_column_indices

        @property
        def has_difference(self):
            """True when at least one column differs."""
            return bool(self.different_column_indices)

    def __init__(self, number_of_columns, matching_key_indices, ignore_column_indices):
        """Work out which column indices are compared.

        Args:
            number_of_columns: Number of columns per row.
            matching_key_indices: Indices of the matching-key columns.
            ignore_column_indices: Indices of columns excluded by the user.
        """

        self.column_indices = range(number_of_columns)
        logger.debug('column_indices={}'.format(self.column_indices))

        # Everything except the key columns and the ignored columns.
        self.target_column_indices = set(self.column_indices).difference(matching_key_indices, ignore_column_indices)
        logger.debug('target_column_indices={}'.format(self.target_column_indices))

    def detect_difference_between(self, lhs_row, rhs_row):
        """Compare the target columns of two rows.

        Returns:
            ValueDifferenceResult: holding the indices whose values differ.

        Raises:
            IndexError: If a row has fewer columns than a target index.
        """

        different_column_indices = []
        for column_index in self.target_column_indices:
            if lhs_row[column_index] != rhs_row[column_index]:
                different_column_indices.append(column_index)

        logger.debug('different_column_indices={}'.format(different_column_indices))
        return self.ValueDifferenceResult(different_column_indices)

522 

523 

524# ---------------------------------------------------------------------------------------------------------------------- 

525# Reporting 

526# ---------------------------------------------------------------------------------------------------------------------- 

527 

class PreScanner:
    """Reads the CSV files once, ahead of the comparison pass.

    The scan determines the number of columns and, when the context needs
    it, the size information used to pad the horizontal report.
    """

    class ScanResult:
        """Outcome of a pre-scan."""

        def __init__(self, number_of_columns, size_info_for_padding):
            self.number_of_columns = number_of_columns
            self.size_info_for_padding = size_info_for_padding

        @classmethod
        def for_lightly(cls, number_of_columns):
            """Result of a light scan: no size information."""
            return PreScanner.ScanResult(number_of_columns, None)

        @classmethod
        def for_deeply(cls, number_of_columns, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
            """Result of a deep scan, including size information."""
            return PreScanner.ScanResult(
                number_of_columns,
                cls.SizeInfoForPadding(lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length))

        class SizeInfoForPadding:
            """Maximum row numbers and row lengths found on each side."""

            def __init__(self, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
                self.lhs_max_row_number = lhs_max_row_number
                self.lhs_max_row_length = lhs_max_row_length
                self.rhs_max_row_number = rhs_max_row_number
                self.rhs_max_row_length = rhs_max_row_length

    def __init__(self):
        pass

    @classmethod
    def scan(cls, context, csv_reader):
        """Run a deep scan if padding sizes are needed, else a light one."""

        if not context.needs_size_info_for_padding:
            return PreScanner._scan_lightly(csv_reader)

        return PreScanner._scan_deeply(csv_reader)

    @classmethod
    def _scan_deeply(cls, csv_reader):
        """
        Notes
        -----
        Purpose of deep pre-scanning
            * Determine the number of columns for value difference detection
            * Get size information to format the horizontal report
        """
        started_at = timeit.default_timer()
        end_of_key = MatchingKeyCodec.END_of_KEY

        lhs_fact = csv_reader.read_lhs()
        rhs_fact = csv_reader.read_rhs()

        # The column count comes from the first row read on either side.
        number_of_columns = cls._determine_number_of_columns_from(lhs_fact, rhs_fact)

        # Row length is measured on the str() form of the row.
        lhs_max_row_length = 0
        while lhs_fact.lhs_key != end_of_key:
            lhs_max_row_length = max(lhs_max_row_length, len(str(lhs_fact.lhs_row)))
            lhs_fact = csv_reader.read_lhs()

        rhs_max_row_length = 0
        while rhs_fact.rhs_key != end_of_key:
            rhs_max_row_length = max(rhs_max_row_length, len(str(rhs_fact.rhs_row)))
            rhs_fact = csv_reader.read_rhs()

        # After reading to the end, the reader's row numbers are the maxima.
        lhs_max_row_number = csv_reader.lhs_csv_state.row_number
        rhs_max_row_number = csv_reader.rhs_csv_state.row_number
        logger.debug('lhs_max_row_number={}'.format(lhs_max_row_number))
        logger.debug('rhs_max_row_number={}'.format(rhs_max_row_number))

        elapsed = timeit.default_timer() - started_at
        logger.debug("PreScanner#scan() elapsed_time:{0}".format(elapsed) + "[sec]")

        return PreScanner.ScanResult.for_deeply(number_of_columns,
                                                lhs_max_row_number, lhs_max_row_length,
                                                rhs_max_row_number, rhs_max_row_length)

    @classmethod
    def _scan_lightly(cls, csv_reader):
        """
        Notes
        -----
        Purpose of light pre-scanning
            * Determine the number of columns for value difference detection

        Vertical reports do not require size information for formatting.
        """

        first_lhs_fact = csv_reader.read_lhs()
        first_rhs_fact = csv_reader.read_rhs()

        number_of_columns = cls._determine_number_of_columns_from(first_lhs_fact, first_rhs_fact)
        return PreScanner.ScanResult.for_lightly(number_of_columns)

    @classmethod
    def _determine_number_of_columns_from(cls, lhs_fact, rhs_fact):
        """Return the length of the first non-empty row, left side first.

        Returns 0 when neither fact carries a non-empty row.
        """

        row = lhs_fact.lhs_row or rhs_fact.rhs_row
        return len(row) if row else 0

629 

630 

class Mark(type):
    """Single-character marks used in the report to classify a row."""

    # Row exists only on the left-hand side / only on the right-hand side.
    LHS_ONLY = '<'
    RHS_ONLY = '>'

    # Row exists on both sides and its values differ.
    HAS_DIFF = '!'

    # Row exists on both sides without a difference (blank / explicit form).
    NON_DIFF = ' '
    NON_DIFF_EXPRESSLY = '='

638 

639 

class HeadingReporter:
    """Prints the report title and, on request, the effective context."""

    def __init__(self, context):
        self.cxt = context

    def report_heading(self):
        """Print the title, then the context if the user asked for it."""

        self._report_title()

        if not self.cxt.shows_context_from_arguments:
            return

        self._report_context()

    @classmethod
    @spacing_before(1)
    def _report_title(cls):
        print('============ Report ============')

    @spacing_before(1)
    def _report_context(self):
        """Print every context setting, one per line."""

        cxt = self.cxt

        def say(template, value):
            print(template.format(value))

        print('* Context')
        say('File Path on the Left-Hand Side: {}', cxt.lhs_file_path)
        say('File Path on the Right-Hand Side : {}', cxt.rhs_file_path)
        say('Matching Key Indices: {}', cxt.matching_key_codec.matching_key_info_list)
        say('Matching Key Is Unique?: {}', cxt.key_should_be_unique)
        say('Column Indices to Ignore: {}', cxt.column_indices_to_ignore)
        say('with Header?: {}', cxt.first_row_is_header)
        say('Report Style: {}', 'Vertical' if cxt.reports_in_vertical_style else 'Two facing (Horizontal)')
        say('Show Count?: {}', cxt.shows_count)
        say('Show Difference Only?: {}', cxt.shows_difference_only)
        say('Show All?: {}', cxt.shows_all_lines)
        say('Show Context?: {}', cxt.shows_context_from_arguments)
        say('CSV Sniffing Size: {}', cxt.sniffing_size)

        print('--- csv analysis conditions ---')
        say('Forces Individual Specified Conditions?: {}', cxt.forces_individual_specs)
        # DONE: how the separator is displayed when it is a tab
        say('column_separator_for_lhs: {}', cxt.display_string_for_column_separator(cxt.column_separator_for_lhs))
        say('column_separator_for_rhs: {}', cxt.display_string_for_column_separator(cxt.column_separator_for_rhs))
        # Line separators are shown as hex (Python 2 str.encode('hex')).
        say('line_separator_for_lhs: {}', cxt.line_separator_for_lhs.encode('hex'))
        say('line_separator_for_rhs: {}', cxt.line_separator_for_rhs.encode('hex'))
        say('quote_char_for_lhs: {}', cxt.quote_char_for_lhs)
        say('quote_char_for_rhs: {}', cxt.quote_char_for_rhs)
        say('skips_space_after_column_separator_for_lhs: {}', cxt.skips_space_after_column_separator_for_lhs)
        say('skips_space_after_column_separator_for_rhs: {}', cxt.skips_space_after_column_separator_for_rhs)

684 

685 

class DetailReporter:
    """Abstract base of the per-row detail reporters.

    Concrete reporters implement the file-name heading and the three
    per-case report methods; ``Factory.reporter_for`` picks the concrete
    class from the context.
    """

    __metaclass__ = abc.ABCMeta

    def __init__(self, context):
        self.cxt = context

    def report_detail_heading(self):
        """Print the detail headings when details are to be shown."""

        if self.cxt.shows_details:
            self._report_content_heading()
            self._report_file_name()

    @spacing_before(1)
    def _report_content_heading(self):
        """Print the heading that names the selected detail mode, if any."""

        if self.cxt.shows_difference_only:
            print('* Differences')
            return

        if self.cxt.shows_all_lines:
            print('* All')

    @abc.abstractmethod
    def _report_file_name(self):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        raise NotImplementedError()

    class Factory:
        """Chooses the concrete reporter for a context."""

        def __init__(self):
            pass

        @staticmethod
        def reporter_for(context, scan_result):
            """Return a VerticalReporter or HorizontalReporter instance."""

            reporter_class = VerticalReporter if context.reports_in_vertical_style else HorizontalReporter
            return reporter_class(context, scan_result)

742 

743 

class HorizontalReporter(DetailReporter):
    """Detail reporter that prints the left and right rows side by side."""

    class Template:
        """Builds the fixed-width text lines of the side-by-side layout."""

        DIFFERENT_COLUMN_GUIDE = 'Column indices with difference'
        PREFIX_of_DIFF_COLUMNS = ' @ '

        def __init__(self, lhs_max_row_number_length, lhs_max_row_length, rhs_max_row_number_length, rhs_max_row_length):
            """Derive every column width from the largest row number/row lengths."""

            # Left-hand area: row number, one filler space, row text.
            self.lhs_max_row_number_length = lhs_max_row_number_length
            self.lhs_filler_length = 1
            self.lhs_max_row_length = lhs_max_row_length
            self.lhs_length = self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length

            # Middle area: the mark itself is one character wide and is padded on
            # both sides; ``diff_mark_length`` ends up as the width of the whole area.
            width_of_mark_itself = 1
            self.diff_mark_filler_length_in_front = 2
            self.diff_mark_filler_length_in_rear = 2
            self.diff_mark_length = self.diff_mark_filler_length_in_front + width_of_mark_itself + self.diff_mark_filler_length_in_rear

            # Right-hand area: same structure as the left-hand one.
            self.rhs_max_row_number_length = rhs_max_row_number_length
            self.rhs_filler_length = 1
            self.rhs_max_row_length = rhs_max_row_length
            self.rhs_length = self.rhs_max_row_number_length + self.rhs_filler_length + self.rhs_max_row_length

            self.prefix_length_for_diff_columns_displays = len(self.PREFIX_of_DIFF_COLUMNS)

        # --- heading-related description ---

        def division_string(self):
            """Return a dashed rule as wide as a full heading line."""

            total_width = (self.lhs_length
                           + self.diff_mark_length
                           + self.rhs_length
                           + self.prefix_length_for_diff_columns_displays
                           + len(self.DIFFERENT_COLUMN_GUIDE))
            return '-' * total_width

        def file_name_description(self, lhs_file_name, rhs_file_name):
            """Return the heading line: both file names aligned over their areas."""

            return ''.join([
                lhs_file_name.ljust(self.lhs_length),
                ' ' * self.diff_mark_length,
                rhs_file_name.ljust(self.rhs_length),
                ' ' * self.prefix_length_for_diff_columns_displays,
                self.DIFFERENT_COLUMN_GUIDE,
            ])

        def _diff_mark_area(self, mark):
            """Return ``mark`` surrounded by the front and rear filler spaces."""

            return (' ' * self.diff_mark_filler_length_in_front) + mark + (' ' * self.diff_mark_filler_length_in_rear)

        # --- left-hand side related description ---

        def lhs_only_description(self, lhs_fact):
            """Return the line for a row present only on the left-hand side."""

            return self._lhs_description(lhs_fact) + self._diff_mark_area(Mark.LHS_ONLY)

        def _lhs_description(self, lhs_fact):
            """Return the left area: right-justified row number, filler, padded row."""

            number_text = str(lhs_fact.lhs_row_number).rjust(self.lhs_max_row_number_length)
            row_text = str(lhs_fact.lhs_row).ljust(self.lhs_max_row_length)
            return number_text + (' ' * self.lhs_filler_length) + row_text

        def _lhs_empty_description(self):
            """Return blanks as wide as the left area."""

            width = self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length
            return ' ' * width

        # --- right-hand side related description ---

        def rhs_only_description(self, rhs_fact):
            """Return the line for a row present only on the right-hand side."""

            return ''.join([
                self._lhs_empty_description(),
                self._diff_mark_area(Mark.RHS_ONLY),
                self._rhs_description(rhs_fact),
            ])

        def _rhs_description(self, rhs_fact):
            """Return the right area: right-justified row number, filler, padded row."""

            number_text = str(rhs_fact.rhs_row_number).rjust(self.rhs_max_row_number_length)
            row_text = str(rhs_fact.rhs_row).ljust(self.rhs_max_row_length)
            return number_text + (' ' * self.rhs_filler_length) + row_text

        # --- both sides related description ---

        def both_description(self, lhs_fact, rhs_fact, value_difference_result):
            """Return the line for a row pair, listing differing columns if any."""

            parts = [self._lhs_description(lhs_fact)]

            if value_difference_result.has_difference:
                parts.append(self._diff_mark_area(Mark.HAS_DIFF))
                parts.append(self._rhs_description(rhs_fact))
                parts.append(self.PREFIX_of_DIFF_COLUMNS)
                parts.append(str(value_difference_result.different_column_indices))
            else:
                parts.append(self._diff_mark_area(Mark.NON_DIFF))
                parts.append(self._rhs_description(rhs_fact))

            return ''.join(parts)

    def __init__(self, context, scan_result):
        """Prepare the layout template when the context asks for padding sizes.

        ``self.template`` is ``None`` when ``context.needs_size_info_for_padding``
        is false.
        """

        # The base initializer stores ``context`` as ``self.cxt``.
        super(HorizontalReporter, self).__init__(context)

        if not context.needs_size_info_for_padding:
            self.template = None
            return

        sizes = scan_result.size_info_for_padding
        self.template = HorizontalReporter.Template(
            lhs_max_row_number_length=len(str(sizes.lhs_max_row_number)),
            lhs_max_row_length=sizes.lhs_max_row_length,
            rhs_max_row_number_length=len(str(sizes.rhs_max_row_number)),
            rhs_max_row_length=sizes.rhs_max_row_length)

    # --- report heading related ---

    def _report_file_name(self):
        """Print the base names of both files framed by two dashed rules."""

        lhs_base_name = os.path.basename(self.cxt.lhs_file_name)
        rhs_base_name = os.path.basename(self.cxt.rhs_file_name)

        print(self.template.division_string())
        print(self.template.file_name_description(lhs_base_name, rhs_base_name))
        print(self.template.division_string())

    # --- report each cases ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        """Print a left-only row when details are shown."""

        if not self.cxt.shows_details:
            return

        print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        """Print a matched row pair according to the display mode.

        The pair is printed when all lines are shown, or when only differences
        are shown and this pair has one.
        """

        is_wanted_difference = self.cxt.shows_difference_only and value_difference_result.has_difference
        if not (is_wanted_difference or self.cxt.shows_all_lines):
            return

        print(self.template.both_description(lhs_fact, rhs_fact, value_difference_result))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        """Print a right-only row when details are shown."""

        if not self.cxt.shows_details:
            return

        print(self.template.rhs_only_description(rhs_fact))

873 

874 

class VerticalReporter(DetailReporter):
    """Detail reporter that prints the left and right rows on separate lines."""

    class Template:
        """Builds the text lines of the stacked (vertical) layout."""

        LHS_MARK = 'L'
        RHS_MARK = 'R'
        PREFIX_of_DIFF_COLUMNS = '@'

        def __init__(self):
            pass

        # --- heading-related description ---

        @classmethod
        def division_string(cls):
            """Return an 80-character dashed rule."""
            return '-' * 80

        @classmethod
        def file_name_description(cls, mark, file_name):
            """Return ``mark`` and ``file_name`` separated by a space."""
            return ' '.join([mark, file_name])

        # --- left-hand side related description ---

        @classmethod
        def lhs_only_description(cls, lhs_fact):
            """Return the line for a row present only on the left-hand side."""
            return ' '.join([
                Mark.LHS_ONLY,
                cls.LHS_MARK,
                str(lhs_fact.lhs_row_number),
                str(lhs_fact.lhs_row),
            ])

        # --- right-hand side related description ---

        @classmethod
        def rhs_only_description(cls, rhs_fact):
            """Return the line for a row present only on the right-hand side."""
            return ' '.join([
                Mark.RHS_ONLY,
                cls.RHS_MARK,
                str(rhs_fact.rhs_row_number),
                str(rhs_fact.rhs_row),
            ])

        # --- both sides related description ---

        @classmethod
        def both_description_heading(cls, value_difference_result):
            """Return the first line of a matched pair.

            With a difference it lists the differing column indices; otherwise
            it is just the explicit "no difference" mark.
            """

            if not value_difference_result.has_difference:
                return Mark.NON_DIFF_EXPRESSLY

            return ' '.join([
                Mark.HAS_DIFF,
                cls.PREFIX_of_DIFF_COLUMNS,
                str(value_difference_result.different_column_indices),
            ])

        @classmethod
        def both_description_lhs(cls, lhs_fact, row_number_length):
            """Return the indented left-hand line of a matched pair."""

            number_text = str(lhs_fact.lhs_row_number).rjust(row_number_length)
            return ' ' + ' '.join([cls.LHS_MARK, number_text, str(lhs_fact.lhs_row)])

        @classmethod
        def both_description_rhs(cls, rhs_fact, row_number_length):
            """Return the indented right-hand line of a matched pair."""

            number_text = str(rhs_fact.rhs_row_number).rjust(row_number_length)
            return ' ' + ' '.join([cls.RHS_MARK, number_text, str(rhs_fact.rhs_row)])

    def __init__(self, context, _):
        """Create the reporter; the second argument (scan result) is not used."""

        # The base initializer stores ``context`` as ``self.cxt``.
        super(VerticalReporter, self).__init__(context)
        self.template = VerticalReporter.Template()

    # --- report heading related ---

    def _report_file_name(self):
        """Print one marked line per file, framed by two dashed rules."""

        template = self.template
        lhs_base_name = os.path.basename(self.cxt.lhs_file_name)
        rhs_base_name = os.path.basename(self.cxt.rhs_file_name)

        print(template.division_string())
        print(template.file_name_description(template.LHS_MARK, lhs_base_name))
        print(template.file_name_description(template.RHS_MARK, rhs_base_name))
        print(template.division_string())

    # --- report each cases ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        """Print a left-only row when details are shown."""

        if not self.cxt.shows_details:
            return

        print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        """Print a matched row pair (heading, left line, right line).

        The pair is printed when all lines are shown, or when only differences
        are shown and this pair has one.
        """

        is_wanted_difference = self.cxt.shows_difference_only and value_difference_result.has_difference
        if not (is_wanted_difference or self.cxt.shows_all_lines):
            return

        # Both row numbers are right-justified to the wider of the two.
        lhs_digits = len(str(lhs_fact.lhs_row_number))
        rhs_digits = len(str(rhs_fact.rhs_row_number))
        shared_width = max(lhs_digits, rhs_digits)

        print(self.template.both_description_heading(value_difference_result))
        print(self.template.both_description_lhs(lhs_fact, shared_width))
        print(self.template.both_description_rhs(rhs_fact, shared_width))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        """Print a right-only row when details are shown."""

        if not self.cxt.shows_details:
            return

        print(self.template.rhs_only_description(rhs_fact))

970 

971 

class CountReporter:
    """Collects per-case tallies during the comparison and prints a summary."""

    class Counter:
        """Tallies and row numbers for each comparison outcome."""

        def __init__(self):

            # How many rows fell into each outcome.
            self.number_of_same_lines = 0
            self.number_of_lhs_only = 0
            self.number_of_rhs_only = 0
            self.number_of_differences = 0

            # Which rows they were; differences map lhs row number -> rhs row number.
            self.row_numbers_for_lhs_only = []
            self.row_numbers_for_rhs_only = []
            self.row_numbers_for_differences = {}

            # Cache for ``max_digit``; filled on first access.
            self._max_digit = None

        def _increment_same_lines(self):
            self.number_of_same_lines = self.number_of_same_lines + 1

        def _increment_lhs_only(self):
            self.number_of_lhs_only = self.number_of_lhs_only + 1

        def _increment_rhs_only(self):
            self.number_of_rhs_only = self.number_of_rhs_only + 1

        def _increment_differences(self):
            self.number_of_differences = self.number_of_differences + 1

        def _add_row_number_for_lhs_only(self, row_number):
            self.row_numbers_for_lhs_only.append(row_number)

        def _add_row_number_for_rhs_only(self, row_number):
            self.row_numbers_for_rhs_only.append(row_number)

        def _add_row_number_for_differences(self, lhs_row_number, rhs_row_number):
            self.row_numbers_for_differences[lhs_row_number] = rhs_row_number

        def count_for_case_of_existed_only_on_lhs(self, row_number):
            """Record one left-only row."""
            self._increment_lhs_only()
            self._add_row_number_for_lhs_only(row_number)

        def count_for_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
            """Record a matched pair as either identical or different."""

            if not value_difference_result.has_difference:
                self._increment_same_lines()
                return

            self._increment_differences()
            self._add_row_number_for_differences(lhs_fact.lhs_row_number, rhs_fact.rhs_row_number)

        def count_for_case_of_existed_only_on_rhs(self, row_number):
            """Record one right-only row."""
            self._increment_rhs_only()
            self._add_row_number_for_rhs_only(row_number)

        @property
        def sorted_row_numbers_for_differences(self):
            """List of (lhs row number, rhs row number) pairs ordered by lhs row number."""
            return sorted(self.row_numbers_for_differences.items(), key=lambda pair: pair[0])

        @property
        def max_digit(self):
            """Digit count of the largest tally.

            Computed on first access and cached, so later changes to the tallies
            are not reflected.
            """

            if self._max_digit is None:
                tallies = (
                    self.number_of_same_lines,
                    self.number_of_lhs_only,
                    self.number_of_rhs_only,
                    self.number_of_differences,
                )
                self._max_digit = max(len(str(tally)) for tally in tallies)

            return self._max_digit

    def __init__(self, shows_count):
        """Create an empty counter; ``shows_count`` enables the printed summary."""
        self.shows_count = shows_count
        self.counter = self.Counter()

    def _func_of_right_justified_number(self):
        """Return a function that right-justifies a number to ``counter.max_digit``."""

        def right_justified(number):
            return str(number).rjust(self.counter.max_digit)

        return right_justified

    @spacing_before(1)
    def report_count(self):
        """Print the tallies and the affected row numbers when counting is enabled."""

        if self.shows_count:

            print('* Count & Row number')

            tallies = self.counter
            aligned = self._func_of_right_justified_number()

            print('same lines : {}'.format(aligned(tallies.number_of_same_lines)))
            print('left side only ({}): {} :-- Row Numbers -->: {}'.format(Mark.LHS_ONLY, aligned(tallies.number_of_lhs_only), tallies.row_numbers_for_lhs_only))
            print('right side only ({}): {} :-- Row Numbers -->: {}'.format(Mark.RHS_ONLY, aligned(tallies.number_of_rhs_only), tallies.row_numbers_for_rhs_only))
            print('with differences ({}): {} :-- Row Number Pairs -->: {}'.format(Mark.HAS_DIFF, aligned(tallies.number_of_differences), tallies.sorted_row_numbers_for_differences))

1068 

1069 

1070# ---------------------------------------------------------------------------------------------------------------------- 

1071# CSV Reading 

1072# ---------------------------------------------------------------------------------------------------------------------- 

1073 

class FileArrangement(type):
    """Suffixes that distinguish left-hand from right-hand settings.

    NOTE(review): these look like the values passed as ``file_arrangement`` to
    ``CsvDialectFixer``, which appends them to context attribute names such as
    ``column_separator`` -- confirm against the callers.
    """

    LHS, RHS = '_for_lhs', '_for_rhs'

1078 

1079 

class CsvDialectFixer:
    """Decides which ``csv`` dialect is used to read one of the two files.

    The dialect is either built from the settings stored on the context or
    detected with ``csv.Sniffer``. Every public and private entry point returns
    a ``(dialect, context)`` pair.
    """

    def __init__(self):
        pass

    @classmethod
    def fixed_dialect(cls, context, csv_file, file_arrangement):
        """Return ``(dialect, context)`` for ``csv_file``.

        :param context: run settings; ``forces_individual_specs`` disables sniffing.
        :param csv_file: open, seekable file object of the CSV to read.
        :param file_arrangement: suffix appended to the context attribute names
            (``column_separator``, ``line_separator``, ``quote_char``,
            ``skips_space_after_column_separator``) to select one side's settings.
        """

        if context.forces_individual_specs:
            return cls._dialect_from_context(context, file_arrangement)

        return cls._try_sniffing(context, csv_file, file_arrangement)

    @classmethod
    def _dialect_from_context(cls, context, file_arrangement):
        """Build an Excel-based dialect overridden with the context's settings."""

        dialect = csv.excel()
        dialect.delimiter = getattr(context, "column_separator" + file_arrangement)
        dialect.lineterminator = getattr(context, "line_separator" + file_arrangement)
        dialect.quotechar = getattr(context, "quote_char" + file_arrangement)
        dialect.skipinitialspace = getattr(context, "skips_space_after_column_separator" + file_arrangement)

        return dialect, context

    @classmethod
    def _try_sniffing(cls, context, csv_file, file_arrangement):
        """Sniff the dialect, falling back to the context's settings on failure.

        The file position is rewound to the start in every case, because
        sniffing consumes the beginning of the file.
        """

        try:
            return cls._sniff(context, csv_file, file_arrangement)

        except csv.Error as e:

            # Format the exception itself rather than ``e.message``: that attribute
            # is deprecated since Python 2.6 and missing on Python 3, where reading
            # it would raise AttributeError and defeat this fallback.
            logger.warning('Sniffing failed. Generated a dialect from context instead. [type={}, args={}, message={}]'.format(type(e), str(e.args), e))
            return cls._dialect_from_context(context, file_arrangement)

        finally:
            csv_file.seek(0)

    @classmethod
    def _sniff(cls, context, csv_file, file_arrangement):
        """Detect the dialect and header from the first ``context.sniffing_size`` characters.

        :raises csv.Error: propagated from ``csv.Sniffer`` when detection fails.
        """

        sample = csv_file.read(context.sniffing_size)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)

        adjusted_context = cls._adjust_context_with(dialect, has_header, context, file_arrangement)

        return dialect, adjusted_context

    @classmethod
    def _adjust_context_with(cls, dialect, has_header, context, file_arrangement):
        """Write the sniffed settings back onto ``context`` and return it.

        An explicit ``context.header`` wins over the sniffed header flag: only
        the value ``'y'`` means the first row is a header.
        """

        setattr(context, "column_separator" + file_arrangement, dialect.delimiter)
        setattr(context, "line_separator" + file_arrangement, dialect.lineterminator)
        setattr(context, "quote_char" + file_arrangement, dialect.quotechar)
        setattr(context, "skips_space_after_column_separator" + file_arrangement, dialect.skipinitialspace)

        if context.header is None:
            context.first_row_is_header = has_header
        else:
            context.first_row_is_header = (context.header == 'y')

        return context

1141 

1142 

def debug_log_dialect(dialect, context, message):
    """Write every attribute of ``dialect`` to the debug log under a ``message`` banner.

    The delimiter and line terminator are logged hex-encoded, which keeps
    characters such as tabs and newlines readable. ``context`` is accepted but
    not used.
    """

    debug = logger.debug

    debug('---{}---'.format(message))
    debug('sniffing dialect={}'.format(dialect))

    is_excel = isinstance(dialect, csv.excel)
    debug('sniffing dialect csv.excel={}'.format(is_excel))

    is_excel_tab = isinstance(dialect, csv.excel_tab)
    debug('sniffing dialect csv.excel_tab={}'.format(is_excel_tab))

    delimiter_in_hex = dialect.delimiter.encode('hex')
    debug('sniffing dialect.delimiter={}'.format(delimiter_in_hex))

    debug('sniffing dialect.doublequote={}'.format(dialect.doublequote))
    debug('sniffing dialect.escapechar={}'.format(dialect.escapechar))

    line_terminator_in_hex = dialect.lineterminator.encode('hex')
    debug('sniffing dialect.lineterminator={}'.format(line_terminator_in_hex))

    debug('sniffing dialect.quotechar={}'.format(dialect.quotechar))
    debug('sniffing dialect.quoting={}'.format(dialect.quoting))
    debug('sniffing dialect.skipinitialspace={}'.format(dialect.skipinitialspace))

1156 

1157 

class LhsFact:
    """One row read from the left-hand file: its row number, cells and matching key."""

    def __init__(self, lhs_row_number, lhs_row, lhs_key):
        """Log the creation at debug level, then store the three values."""

        creation_message = 'LhsFact 生成 lhs_row_number={}, lhs_row={}, lhs_key={}'
        logger.debug(creation_message.format(lhs_row_number, lhs_row, lhs_key))

        self.lhs_row_number, self.lhs_row, self.lhs_key = lhs_row_number, lhs_row, lhs_key

1167 

1168 

class RhsFact:
    """One row read from the right-hand file: its row number, cells and matching key."""

    def __init__(self, rhs_row_number, rhs_row, rhs_key):
        """Log the creation at debug level, then store the three values."""

        creation_message = 'RhsFact 生成 rhs_row_number={}, rhs_row={}, rhs_key={}'
        logger.debug(creation_message.format(rhs_row_number, rhs_row, rhs_key))

        self.rhs_row_number, self.rhs_row, self.rhs_key = rhs_row_number, rhs_row, rhs_key

1178 

1179 

class CsvReader:
    """Reads the left- and right-hand CSV files one row at a time.

    Each read returns a fact object carrying the row number, the row and its
    matching key. While reading, the keys of each file are checked to be in
    ascending order (and unique when the context requires it); a violation is
    logged and terminates the program with exit status 1.
    """

    class State:
        """Reading state of one file: reader, current row number and last key."""

        def __init__(self, csv_file, dialect, file_name, first_row_is_header):

            self._csv_file = csv_file
            self._dialect = dialect
            self._file_name = file_name
            self._first_row_is_header = first_row_is_header

            self._csv_reader = csv.reader(csv_file, dialect)
            self._row_number = 0
            self._previous_key = ""

        def reset(self):
            """Rewind the file and start over with a fresh reader and counters."""

            self._csv_file.seek(0)
            self._csv_reader = csv.reader(self._csv_file, self._dialect)
            self._row_number = 0
            self._previous_key = ""

        def increment_row_number(self):
            """Advance the row number unless the end-of-file key was already recorded."""

            # Reads past the end of the file must not keep counting rows.
            if self._previous_key == MatchingKeyCodec.END_of_KEY:
                return

            self._row_number += 1

        def key_changed(self, new_key):
            """Remember ``new_key`` as the previous key, ignoring the header row."""

            # The header row carries no real key, so it must not take part in
            # the ordering check of the first data row.
            if self._is_header():
                return

            self._previous_key = new_key

        def _is_header(self):
            """Tell whether the row being processed is the header row."""
            return self.row_number == 0 and self._first_row_is_header

        @property
        def csv_reader(self):
            return self._csv_reader

        @property
        def file_name(self):
            return self._file_name

        @property
        def row_number(self):
            return self._row_number

        @property
        def previous_key(self):
            return self._previous_key

    def __init__(self, lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, context):
        """Set up the state of both files and skip their header rows if present.

        :param lhs_csv: open, seekable file object of the left-hand CSV.
        :param rhs_csv: open, seekable file object of the right-hand CSV.
        :param lhs_dialect: csv dialect used for the left-hand file.
        :param rhs_dialect: csv dialect used for the right-hand file.
        :param context: run settings (file names, header flag, key codec, ...).
        """

        debug_log_dialect(lhs_dialect, context, '左CSV')
        debug_log_dialect(rhs_dialect, context, '右CSV')

        self.lhs_csv_state = CsvReader.State(lhs_csv, lhs_dialect, context.lhs_file_name, context.first_row_is_header)
        self.rhs_csv_state = CsvReader.State(rhs_csv, rhs_dialect, context.rhs_file_name, context.first_row_is_header)
        self.cxt = context

        self.skip_header()

    def skip_header(self):
        """Consume the first row of both files when it is a header."""

        if self.cxt.first_row_is_header:
            _ = self.read_lhs()
            _ = self.read_rhs()

    def reset(self):
        """Rewind both files and skip their headers again."""

        self.lhs_csv_state.reset()
        self.rhs_csv_state.reset()
        self.skip_header()

    def read_lhs(self):
        """Read the next left-hand row and return it as an ``LhsFact``."""

        lhs_row, lhs_key = self._read_csv(self.lhs_csv_state)
        self.lhs_csv_state.increment_row_number()
        return LhsFact(self.lhs_csv_state.row_number, lhs_row, lhs_key)

    def read_rhs(self):
        """Read the next right-hand row and return it as an ``RhsFact``."""

        rhs_row, rhs_key = self._read_csv(self.rhs_csv_state)
        self.rhs_csv_state.increment_row_number()
        return RhsFact(self.rhs_csv_state.row_number, rhs_row, rhs_key)

    def _read_csv(self, csv_state):
        """Return ``(row, key)`` for the next row of ``csv_state``.

        At the end of the file an empty row and the end-of-key sentinel are
        returned instead.
        """

        try:
            row = next(csv_state.csv_reader)
        except StopIteration:
            csv_state.key_changed(MatchingKeyCodec.END_of_KEY)
            return [], MatchingKeyCodec.END_of_KEY

        new_key = self.cxt.matching_key_codec.managed_key_for(row)
        self._detect_key_violation(new_key, csv_state)

        csv_state.key_changed(new_key)

        return row, new_key

    def _detect_key_violation(self, new_key, csv_state):
        """Abort when ``new_key`` breaks the ordering or uniqueness of the keys.

        :raises SystemExit: with status 1, after logging the offending keys.
        """

        # No previous key yet: nothing to compare against.
        if csv_state.previous_key == '':
            return

        if new_key < csv_state.previous_key:
            logger.error('matching keys in {} are not sorted.'
                         ' [current_key={}, previous_key={}, matching-key-indices={}] If the key is a number without zero padding, specify the max size of the key after colon like -k0:8.'.format(
                             csv_state.file_name, MatchingKeyCodec.decode_key(new_key), MatchingKeyCodec.decode_key(csv_state.previous_key), self.cxt.matching_key_codec.matching_key_info_list))
            # sys.exit rather than the site-provided exit(), which is intended for
            # the interactive shell and is unavailable under ``python -S``.
            sys.exit(1)

        if self.cxt.key_should_be_unique and new_key == csv_state.previous_key:
            logger.error('matching keys in {} are not unique.'
                         ' [current_key={}, previous_key={}, matching-key-indices={}]'.format(
                             csv_state.file_name, MatchingKeyCodec.decode_key(new_key), MatchingKeyCodec.decode_key(csv_state.previous_key), self.cxt.matching_key_codec.matching_key_info_list))
            sys.exit(1)

1302 

1303 

# Run the comparison only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':

    main()

1307