Coverage for src/csvdiff3/csvdiff.py: 96%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1#!/usr/bin/env python3

2# -*- coding: utf-8 -*-

3import abc

4import binascii

5import csv

6import functools

7import logging

8import os

9import sys

10import time

11import traceback

12import unicodedata

13from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

14from logging import Logger

17# ----------------------------------------------------------------------------------------------------------------------

18# Decorators

19# ----------------------------------------------------------------------------------------------------------------------

def show_execution_time():
    """Build a decorator that prints the wall-clock time spent in each call.

    Returns
    -------
    callable
        A decorator. The decorated function behaves like the original one
        (same arguments, same return value) and additionally prints
        ``elapsed_time=<seconds>[sec]`` surrounded by blank lines once the
        call has finished.
    """

    def _execution_time(func):

        @functools.wraps(func)  # keep __name__/__doc__ of the wrapped function
        def wrapper(*args, **kwargs):

            start = time.perf_counter()

            # Hand the result back to the caller instead of dropping it.
            result = func(*args, **kwargs)

            elapsed_time = time.perf_counter() - start
            print()
            print(f'elapsed_time={elapsed_time}[sec]')
            print()

            return result

        return wrapper

    return _execution_time

def spacing_before(number_of_lines):
    """Build a decorator that prints blank lines before each call.

    Parameters
    ----------
    number_of_lines
        Number of blank lines to print. A falsy value (``None``, ``0``)
        is treated as 1.

    Returns
    -------
    callable
        A decorator. The decorated function behaves like the original one
        (same arguments, same return value) but is preceded by the blank lines.
    """

    number_of_lines = number_of_lines or 1

    def _spacing_before(func):

        @functools.wraps(func)  # keep __name__/__doc__ of the wrapped function
        def wrapper(*args, **kwargs):

            for _ in range(number_of_lines):
                print('')

            # Hand the result back to the caller instead of dropping it.
            return func(*args, **kwargs)

        return wrapper

    return _spacing_before

59# ----------------------------------------------------------------------------------------------------------------------

60# Entrance

61# ----------------------------------------------------------------------------------------------------------------------

63# @show_execution_time()

def main():
    """Entry point: set up logging, build the context from the CLI and run the diff.

    An ``IndexError`` escaping from the run is reported through the module
    logger and turned into exit status 1.
    """

    configure()

    cxt = context_from_arguments()
    show_context_for_debugging(cxt)

    try:
        run_in(cxt)
    except IndexError as error:
        message = f'It is possible that the number of columns in the row is not aligned. Please check the csv data. If not, please file an issue. [{type(error)}, description={error}]'
        logger.error(message)
        sys.exit(1)

class App(type):
    """Constant holder for the program identity shown by the argument parser."""

    # Program name passed to ArgumentParser(prog=...).
    NAME = "csv-diff-python3@blue-monk"

    # Version string printed by the --version option.
    VERSION = "1.0.0"

class LoggingConfig(type):
    """Constant holder for the logging set-up applied by ``configure()``.

    For debugging, play with CONSOLE_LEVEL or FILE_LEVEL.
    """

    # Level handed to logging.basicConfig().
    BASE_LEVEL = logging.DEBUG

    # Console (stream) handler settings.
    CONSOLE_LEVEL = logging.ERROR
    CONSOLE_FORMAT = "%(levelname)s: %(message)s"

    # File handler settings.
    FILE_LEVEL = logging.WARNING
    FILE_FORMAT = "%(asctime)s: %(levelname)s: %(message)s"
    FILE_PATH = "csvdiff.log"

logger: Logger = logging.getLogger(__name__)


def configure():
    """Attach a console handler and a file handler to the module logger.

    The root logger is configured with ``LoggingConfig.BASE_LEVEL``; the module
    logger gets its own two handlers and stops propagating to the root logger.
    The log file is opened in ``'w'`` mode, i.e. it is rewritten on every call.
    """

    logging.basicConfig(level=LoggingConfig.BASE_LEVEL)

    # Create both handlers first so that nothing is attached if one of them fails.
    console_handler = logging.StreamHandler()
    log_file_handler = logging.FileHandler(filename=LoggingConfig.FILE_PATH, mode='w')

    handler_settings = (
        (console_handler, LoggingConfig.CONSOLE_LEVEL, LoggingConfig.CONSOLE_FORMAT),
        (log_file_handler, LoggingConfig.FILE_LEVEL, LoggingConfig.FILE_FORMAT),
    )
    for handler, level, message_format in handler_settings:
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter(message_format))

    for handler, _, _ in handler_settings:
        logger.addHandler(handler)

    logger.propagate = False

117

118

119# ----------------------------------------------------------------------------------------------------------------------

120# Context Preparation

121# ----------------------------------------------------------------------------------------------------------------------

122

def context_from_arguments():
    """Parse the command line and wrap the result in a ``Context``.

    Returns
    -------
    Context
        The context built from ``parser.parse_args()``.

    Notes
    -----
    ``--sniffing-size`` is parsed as an integer so that a value given on the
    command line has the same type as the built-in default (4096); a
    non-numeric value is rejected by argparse with a usage error.
    """

    def arg_type_matching_key_in_csv(x):
        # 'index[:max_length]' items separated by commas -> list of MatchingKeyInfo
        return list(map(MatchingKeyInfo, x.split(',')))

    def arg_type_int_in_csv(x):
        # comma separated integers -> list of int
        return list(map(int, x.split(',')))

    column_separator_choices = ['COMMA', 'TAB', 'SEMICOLON']
    line_separator_choices = ['LF', 'CRLF']
    quote_char_choices = ['"', "'"]

    parser = ArgumentParser(prog=App.NAME, formatter_class=ArgumentDefaultsHelpFormatter)

    # Program name & Version -------------------------------------------------------------------------------------------
    parser.add_argument('--version', action='version', version=f'%(prog)s {App.VERSION}')

    # Input CSV file paths ---------------------------------------------------------------------------------------------
    parser.add_argument('lhs_file_name', type=str, help='Absolute/Relative path to left-hand side file.')
    parser.add_argument('rhs_file_name', type=str, help='Absolute/Relative path to right-hand side file.')

    # Input CSV file encodings -----------------------------------------------------------------------------------------
    parser.add_argument('-e', '--encoding', type=str, default=None,
                        help='Encoding of the CSV files. (refer public reference named "Standard encoding") e.g.: shift_jis')

    parser.add_argument('--encoding-for-lhs', type=str, default='utf8',
                        help='Encoding of the CSV file on the left side. (refer public reference named "Standard encoding") e.g.: shift_jis')
    parser.add_argument('--encoding-for-rhs', type=str, default='utf8',
                        help='Encoding of the CSV file on the right side. (refer public reference named "Standard encoding") e.g.: shift_jis')

    # Matching conditions ----------------------------------------------------------------------------------------------
    # The string default is run through the type function by argparse as well.
    parser.add_argument('-k', '--matching-keys', type=arg_type_matching_key_in_csv, default='0',
                        help='Matching key indices(from 0) for Input CSV in CSV format. For non-fixed length numbers, specify the number of digits after ":". e.g.: 0:8,3')
    parser.add_argument('-u', '--unique-key', default=False, action='store_true',
                        help="Specify if the matching key is unique. Then, if it detects that the matching key is not unique, an error will occur.")
    parser.add_argument('-i', '--ignore-columns', type=arg_type_int_in_csv, default=[],
                        help='Specify the index of the column to be ignored in CSV format. e.g.: 3,7')

    # Report styles ----------------------------------------------------------------------------------------------------
    parser.add_argument('-v', '--vertical-style', default=False, action='store_true',
                        help='Report in vertical style. If not specified, report in horizontal(two facing) style.')

    parser.add_argument('-c', '--show-count', default=False, action='store_true',
                        help='Report the number of differences. Treat this as True if neither -d nor -a is specified.')

    display_group = parser.add_mutually_exclusive_group()
    display_group.add_argument('-d', '--show-difference-only', default=False, action='store_true',
                               help='Report the lines with the difference. Can be used with option -c. Cannot be used with option -a.')
    display_group.add_argument('-a', '--show-all-lines', action='store_true',
                               help='Report on all lines. Can be used with option -c. Cannot be used with option -d.')

    parser.add_argument('-x', '--show-context-from-arguments', default=False, action='store_true',
                        help='Report the context generated from the arguments and CSV sniffing.')

    # CSV analysis conditions ------------------------------------------------------------------------------------------
    parser.add_argument('-H', '--header', type=str, default=None, choices=['n', 'y'],
                        help='If specified, this specification will be enforced.')

    # int (not str): keeps a user supplied size the same type as the default.
    parser.add_argument('-S', '--sniffing-size', type=int, default=4096,
                        help="If csv sniffing fails, try specifying a size larger than 4096. Or Explicitly specify CSV file conditions like '--column-separator-for-lhs TAB'. Check help with -h option.")

    parser.add_argument('-F', '--force-individual-specs', action='store_true',
                        help="If you don't want to rely on csv sniffing, specify it, and then specify --column-separator and so on separately.")

    parser.add_argument('--column-separator', type=str, default=None, choices=column_separator_choices,
                        help='Process both sides CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator', type=str, default=None, choices=line_separator_choices,
                        help='Process both sides CSV file using the specified line separator.')

    parser.add_argument('--quote-char', type=str, default=None, choices=quote_char_choices,
                        help='Process both sides CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator', action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the both sides CSV file.')

    # CSV analysis conditions by left and right ------------------------------------------------------------------------
    parser.add_argument('--column-separator-for-lhs', type=str, default="COMMA", choices=column_separator_choices,
                        help='Process left-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--column-separator-for-rhs', type=str, default="COMMA", choices=column_separator_choices,
                        help='Process right-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator-for-lhs', type=str, default="LF", choices=line_separator_choices,
                        help='Process left-hand side CSV file using the specified line separator.')

    parser.add_argument('--line-separator-for-rhs', type=str, default="LF", choices=line_separator_choices,
                        help='Process right-hand side CSV file using the specified line separator.')

    parser.add_argument('--quote-char-for-lhs', type=str, default='"', choices=quote_char_choices,
                        help='Process left-hand side CSV file using the specified quote character.')

    parser.add_argument('--quote-char-for-rhs', type=str, default='"', choices=quote_char_choices,
                        help='Process right-hand side CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator-for-lhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the left side.')

    parser.add_argument('--no-skip-space-after-column-separator-for-rhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the right side.')

    # ------------------------------------------------------------------------------------------------------------------

    return Context(parser.parse_args())

224

225

class Context:
    """Run-time settings derived from the parsed command-line arguments.

    Parameters
    ----------
    args
        The namespace returned by the argument parser. The attributes read
        from it are listed in ``__init__``.

    Notes
    -----
    Constructing a context validates that both input paths exist and are files
    (the process exits with status 1 otherwise) and turns on count reporting
    when no report option was selected.
    """

    # Display name -> actual line separator.
    LINE_SEPARATOR_s = {
        "CR": '\r',
        "LF": '\n',
        "CRLF": '\r\n',
        None: '<None>',
    }

    # Display name -> actual column separator.
    COLUMN_SEPARATOR_s = {
        "COMMA": ',',
        "TAB": '\t',
        "SEMICOLON": ';',
        None: '<None>',
    }

    def __init__(self, args):

        # Input CSV file paths -----------------------------------------------------------------------------------------
        self.lhs_file_name = args.lhs_file_name
        self.rhs_file_name = args.rhs_file_name
        self.lhs_file_path = os.path.abspath(args.lhs_file_name)
        self.rhs_file_path = os.path.abspath(args.rhs_file_name)

        # Input CSV file encodings -------------------------------------------------------------------------------------
        # A common --encoding wins over the per-side encodings.
        if args.encoding:
            self.encoding_for_lhs = args.encoding
            self.encoding_for_rhs = args.encoding
        else:
            self.encoding_for_lhs = args.encoding_for_lhs
            self.encoding_for_rhs = args.encoding_for_rhs

        # Matching conditions ------------------------------------------------------------------------------------------
        self.matching_key_codec = MatchingKeyCodec(args.matching_keys)
        self.key_should_be_unique = args.unique_key
        self.column_indices_to_ignore = args.ignore_columns

        # Report styles ------------------------------------------------------------------------------------------------
        self.reports_in_vertical_style = args.vertical_style
        self.reports_in_horizontal_style = not args.vertical_style

        self.shows_count = args.show_count
        self.shows_difference_only = args.show_difference_only
        self.shows_all_lines = args.show_all_lines
        self.shows_details = bool(self.shows_difference_only or self.shows_all_lines)
        self.shows_context_from_arguments = args.show_context_from_arguments

        self.needs_size_info_for_padding = self.shows_details and self.reports_in_horizontal_style

        # CSV analysis conditions --------------------------------------------------------------------------------------
        self.header = args.header
        self.first_row_is_header = None

        self.sniffing_size = args.sniffing_size

        self.forces_individual_specs = args.force_individual_specs

        # For each condition: the "both sides" option applies only together with
        # --force-individual-specs; otherwise the per-side options are used.
        if self.forces_individual_specs and args.column_separator:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator]
        else:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_lhs]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_rhs]

        if self.forces_individual_specs and args.line_separator:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator]
        else:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator_for_lhs]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator_for_rhs]

        if self.forces_individual_specs and args.quote_char:
            self.quote_char_for_lhs = args.quote_char
            self.quote_char_for_rhs = args.quote_char
        else:
            self.quote_char_for_lhs = args.quote_char_for_lhs
            self.quote_char_for_rhs = args.quote_char_for_rhs

        if self.forces_individual_specs and args.no_skip_space_after_column_separator:
            self.skips_space_after_column_separator_for_lhs = False
            self.skips_space_after_column_separator_for_rhs = False
        else:
            # Honor the per-side flags. They default to False, so without them
            # the space after the separator is skipped on both sides.
            # getattr keeps namespaces that lack the per-side attributes working.
            self.skips_space_after_column_separator_for_lhs = \
                not getattr(args, 'no_skip_space_after_column_separator_for_lhs', False)
            self.skips_space_after_column_separator_for_rhs = \
                not getattr(args, 'no_skip_space_after_column_separator_for_rhs', False)

        self._validate()
        self._normalize()

    def _validate(self):
        """Exit with status 1 unless both input paths exist and are regular files."""

        if not os.path.exists(self.lhs_file_path):
            logger.error(f'lhs_file_path not exists. [lhs_file_path={self.lhs_file_path}]')
            sys.exit(1)
        if not os.path.exists(self.rhs_file_path):
            logger.error(f'rhs_file_path not exists. [rhs_file_path={self.rhs_file_path}]')
            sys.exit(1)

        if not os.path.isfile(self.lhs_file_path):
            logger.error(f'lhs_file_path is not a file. [lhs_file_path={self.lhs_file_path}]')
            sys.exit(1)
        if not os.path.isfile(self.rhs_file_path):
            logger.error(f'rhs_file_path is not a file. [rhs_file_path={self.rhs_file_path}]')
            sys.exit(1)

    def _normalize(self):
        """Turn on count reporting when no report option (-c, -d, -a) was selected."""

        if not any([self.shows_count, self.shows_difference_only, self.shows_all_lines]):
            self.shows_count = True

    def display_string_for_column_separator(self, value):
        """Return the display name registered for a column separator.

        Parameters
        ----------
        value
            The actual separator, e.g. ``','``.

        Returns
        -------
        The first key of ``COLUMN_SEPARATOR_s`` mapped to ``value``, or
        ``'undefined(<value>)'`` when the separator is not registered.
        """

        candidates = [k for k, v in self.COLUMN_SEPARATOR_s.items() if v == value]
        if candidates:
            return candidates[0]

        return f'undefined({value})'

    def display_string_for_line_separator(self, value, file_arrangement):
        """Return a line separator as the hex digits of its encoded bytes.

        Parameters
        ----------
        value
            The line separator string.
        file_arrangement
            Suffix appended to ``"encoding"`` to pick the encoding attribute
            of this context that is used to encode ``value``.
        """

        encoding_value = getattr(self, "encoding" + file_arrangement)
        return binascii.hexlify(value.encode(encoding_value)).decode()

355

356

def show_context_for_debugging(cxt):
    """Write every setting of the context to the module logger at DEBUG level.

    Parameters
    ----------
    cxt
        The context whose attributes are logged, one ``name=value`` line each.
    """

    def dump(*attribute_names):
        # One debug line per attribute, labelled with the attribute name itself.
        for attribute_name in attribute_names:
            logger.debug(f'{attribute_name}={getattr(cxt, attribute_name)}')

    dump('lhs_file_name', 'rhs_file_name', 'lhs_file_path', 'rhs_file_path')
    dump('encoding_for_lhs', 'encoding_for_rhs')
    dump('matching_key_codec', 'key_should_be_unique', 'column_indices_to_ignore')
    dump('reports_in_vertical_style', 'reports_in_horizontal_style',
         'shows_count', 'shows_difference_only', 'shows_all_lines',
         'shows_context_from_arguments', 'needs_size_info_for_padding')
    dump('first_row_is_header', 'sniffing_size')

    # The label of this entry differs from the attribute name.
    logger.debug(f'force_individual_specs={cxt.forces_individual_specs}')

    # Separators are logged in their display form rather than as raw characters.
    lhs_column_separator = cxt.display_string_for_column_separator(cxt.column_separator_for_lhs)
    logger.debug(f'column_separator_for_lhs={lhs_column_separator}')
    rhs_column_separator = cxt.display_string_for_column_separator(cxt.column_separator_for_rhs)
    logger.debug(f'column_separator_for_rhs={rhs_column_separator}')
    lhs_line_separator = cxt.display_string_for_line_separator(cxt.line_separator_for_lhs, FileArrangement.LHS)
    logger.debug(f'line_separator_for_lhs={lhs_line_separator}')
    rhs_line_separator = cxt.display_string_for_line_separator(cxt.line_separator_for_rhs, FileArrangement.RHS)
    logger.debug(f'line_separator_for_rhs={rhs_line_separator}')

    dump('quote_char_for_lhs', 'quote_char_for_rhs',
         'skips_space_after_column_separator_for_lhs', 'skips_space_after_column_separator_for_rhs')

    logger.debug(f'MatchingKeyCodec#END_of_KEY={MatchingKeyCodec.END_of_KEY}')

393

394

395# ----------------------------------------------------------------------------------------------------------------------

396# Matching Key Treatment

397# ----------------------------------------------------------------------------------------------------------------------

398

class MatchingKeyInfo:
    """One matching-key column: its index and an optional zero-padding width.

    Parameters
    ----------
    specified_string
        ``'<index>'`` or ``'<index>:<max_length>'``, e.g. ``'0:8'``.
        Whitespace around the numbers is ignored and empty elements are
        dropped. When no usable number is given, an error is logged and the
        process exits with status 1.

    Attributes
    ----------
    index
        Column index of the key (int).
    max_length
        Width the key value is left-padded to with ``'0'``; 0 when not specified.
    """

    def __init__(self, specified_string):

        elements = [element.strip() for element in specified_string.split(':')]
        elements = [element for element in elements if element != '']

        # An empty specification is routed through the regular "not a number"
        # error path instead of failing on pop() from an empty list.
        index = elements.pop(0) if elements else ''
        self.index = self._transform_into_numeric(index, 'index')

        max_length = elements.pop(0) if elements else '0'
        self.max_length = self._transform_into_numeric(max_length, 'max_length')

    def __repr__(self):
        return f"{self.__class__.__name__}({self.index!r}, {(self.max_length if self.max_length > 0 else '<not specified>')!r})"

    @classmethod
    def _transform_into_numeric(cls, value, name):
        """Convert ``value`` to int, or log an error and exit with status 1."""

        # isdecimal() accepts exactly the characters int() can convert;
        # isdigit() also accepts e.g. superscript digits, which int() rejects.
        if not value.isdecimal():
            logger.error(f'MATCHING_KEY_INDICES should be a number. See also help. [specified {name}={value}]')
            # sys.exit is always available; the bare exit() helper is only
            # installed by the site module.
            sys.exit(1)

        return int(value)

    def key_for(self, row):
        """Return the key value of ``row``, left-padded with '0' to ``max_length``."""
        return row[self.index].rjust(self.max_length, '0')

425

426

class MatchingKeyCodec:
    """Builds and decodes the composite key used to match rows of both files.

    A managed key has the form ``'..<key1>..<key2>..'``: every key value is
    followed by ``SEPARATOR`` and the whole string starts with ``SEPARATOR``.

    Parameters
    ----------
    matching_key_info_list
        Objects providing ``index`` and ``key_for(row)``, one per key column.
    """

    # Key value that stands for "no more rows".
    END_of_KEY = 'ZZZ'
    SEPARATOR = '..'

    def __init__(self, matching_key_info_list):
        self.matching_key_info_list = matching_key_info_list

    def __repr__(self):
        return f'{self.__class__.__name__}({self.matching_key_info_list!r})'

    def managed_key_for(self, row):
        """Return the composite key of ``row``.

        When a key index is out of range for ``row``, an error is logged and
        the process exits with status 1.
        """

        try:
            key_parts = (matching_key.key_for(row) + self.SEPARATOR for matching_key in self.matching_key_info_list)
            return self.SEPARATOR + ''.join(key_parts)
        except IndexError:
            logger.error(f'one of the indices specified for MATCHING_KEY_INDICES is out of range [MATCHING_KEY_INDICES={self.matching_key_info_list}, number of columns = {len(row)}, row={row}]')
            # sys.exit is always available; the bare exit() helper is only
            # installed by the site module.
            sys.exit(1)

    @property
    def matching_key_indices(self):
        """Column indices of all key columns, in specification order."""
        return [matching_key_info.index for matching_key_info in self.matching_key_info_list]

    @classmethod
    def decode_key(cls, key):
        """Split a managed key into its key values. Leave the padding as it is.

        Exactly one leading and one trailing ``SEPARATOR`` are removed;
        ``str.strip(SEPARATOR)`` would remove *any* run of '.' characters and
        thereby damage key values that start or end with a dot.
        """

        separator = cls.SEPARATOR
        if key.startswith(separator):
            key = key[len(separator):]
        if key.endswith(separator):
            key = key[:-len(separator)]

        return key.split(separator)

455

456

457

458# ----------------------------------------------------------------------------------------------------------------------

459# Control and Determine if it exists only on the left, only on the right, or both

460# ----------------------------------------------------------------------------------------------------------------------

461

def run_in(context):
    """Open both CSV files, fix their dialects, pre-scan them and report the differences.

    Parameters
    ----------
    context
        The context holding file paths, encodings and all other settings.
    """

    with open(context.lhs_file_path, mode='r', encoding=context.encoding_for_lhs) as lhs_csv:
        with open(context.rhs_file_path, mode='r', encoding=context.encoding_for_rhs) as rhs_csv:

            # Each fixing step hands back a context, which is fed into the next step.
            lhs_dialect, context_after_lhs = CsvDialectFixer.fixed_dialect(context, lhs_csv, FileArrangement.LHS)
            rhs_dialect, fixed_context = CsvDialectFixer.fixed_dialect(context_after_lhs, rhs_csv, FileArrangement.RHS)

            reader = CsvReader(lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, fixed_context)

            # The pre-scan consumes rows, so the reader is reset before the real pass.
            scan_result = PreScanner.scan(fixed_context, reader)
            reader.reset()

            detect_diff(fixed_context, reader, scan_result)

475

476

def detect_diff(context, csv_reader, pre_scan_result):
    """Match the rows of both files by key and report counts and details.

    Parameters
    ----------
    context
        The context with matching keys, ignored columns and report options.
    csv_reader
        Reader delivering the rows of both sides.
    pre_scan_result
        Result of the pre-scan; supplies the number of columns.
    """

    difference_detector = ValueDifferenceDetector(
        pre_scan_result.number_of_columns,
        context.matching_key_codec.matching_key_indices,
        context.column_indices_to_ignore,
    )

    headings = HeadingReporter(context)
    details = DetailReporter.Factory.reporter_for(context, pre_scan_result)
    counts = CountReporter(context.shows_count)
    tally = counts.counter

    headings.report_heading()
    details.report_detail_heading()

    # Callbacks for the three outcomes of key matching: count first, then report.

    def on_lhs_only(lhs_fact):
        tally.count_for_case_of_existed_only_on_lhs(lhs_fact.lhs_row_number)
        details.report_case_of_existed_only_on_lhs(lhs_fact)

    def on_both_sides(lhs_fact, rhs_fact):
        difference = difference_detector.detect_difference_between(lhs_fact.lhs_row, rhs_fact.rhs_row)
        tally.count_for_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)
        details.report_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)

    def on_rhs_only(rhs_fact):
        tally.count_for_case_of_existed_only_on_rhs(rhs_fact.rhs_row_number)
        details.report_case_of_existed_only_on_rhs(rhs_fact)

    perform_key_matching(csv_reader, on_lhs_only, on_both_sides, on_rhs_only)

    counts.report_count()

509

510

def perform_key_matching(csv_reader, callback_for_lhs_only, callback_for_both_sides, callback_for_rhs_only):
    """Walk both sides in key order and dispatch every row to the fitting callback.

    Parameters
    ----------
    csv_reader
        Provides ``read_lhs()`` / ``read_rhs()``; the returned facts expose
        ``lhs_key`` / ``rhs_key``.
    callback_for_lhs_only
        Called with the left fact when its key is smaller than the right key.
    callback_for_both_sides
        Called with both facts when the keys are equal.
    callback_for_rhs_only
        Called with the right fact when its key is smaller than the left key.

    Notes
    -----
    The walk ends once both sides deliver ``MatchingKeyCodec.END_of_KEY``.
    """

    end_of_key = MatchingKeyCodec.END_of_KEY

    left = csv_reader.read_lhs()
    right = csv_reader.read_rhs()

    while not (left.lhs_key == end_of_key and right.rhs_key == end_of_key):

        if left.lhs_key == right.rhs_key:
            # Same key on both sides: report the pair and advance both.
            callback_for_both_sides(left, right)
            left = csv_reader.read_lhs()
            right = csv_reader.read_rhs()

        elif left.lhs_key < right.rhs_key:
            # The left side is behind: its row has no partner on the right.
            callback_for_lhs_only(left)
            left = csv_reader.read_lhs()

        elif left.lhs_key > right.rhs_key:
            # The right side is behind: its row has no partner on the left.
            callback_for_rhs_only(right)
            right = csv_reader.read_rhs()

530

531

532# ----------------------------------------------------------------------------------------------------------------------

533# Value-Difference Detection

534# ----------------------------------------------------------------------------------------------------------------------

535

class ValueDifferenceDetector:
    """Finds the columns whose values differ between two rows with the same key.

    Parameters
    ----------
    number_of_columns
        Number of columns per row; columns ``0 .. number_of_columns - 1`` are candidates.
    matching_key_indices
        Indices of the key columns; they are excluded from the comparison.
    ignore_column_indices
        Indices of further columns to exclude from the comparison.
    """

    class ValueDifferenceResult:
        """Outcome of one comparison: the indices of the differing columns."""

        def __init__(self, different_column_indices):

            self.different_column_indices = different_column_indices

        @property
        def has_difference(self):
            """True when at least one column differs."""
            return bool(self.different_column_indices)

    def __init__(self, number_of_columns, matching_key_indices, ignore_column_indices):

        self.column_indices = range(0, number_of_columns)
        logger.debug(f'column_indices={self.column_indices}')

        self.target_column_indices = set(self.column_indices) - set(matching_key_indices) - set(ignore_column_indices)
        logger.debug(f'target_column_indices={self.target_column_indices}')

    def detect_difference_between(self, lhs_row, rhs_row):
        """Compare the target columns of two rows.

        Returns
        -------
        ValueDifferenceResult
            Holds the differing column indices in ascending order.

        Notes
        -----
        A row shorter than a target index raises ``IndexError``; this is left
        to the caller to handle.
        """

        # The targets are kept in a set, whose iteration order is not guaranteed
        # to be ascending; sort so that the result reads in column order.
        different_column_indices = sorted(index for index in self.target_column_indices
                                          if lhs_row[index] != rhs_row[index])
        logger.debug(f'different_column_indices={different_column_indices}')
        return self.ValueDifferenceResult(different_column_indices)

562

563

564

565# ----------------------------------------------------------------------------------------------------------------------

566# Reporting

567# ----------------------------------------------------------------------------------------------------------------------

568

class PreScanner:
    """Scans the input before the actual comparison to learn its dimensions."""

    class ScanResult:
        """What a pre-scan found out.

        Attributes
        ----------
        number_of_columns
            Number of columns taken from the first row.
        size_info_for_padding
            ``SizeInfoForPadding`` after a deep scan, ``None`` after a light scan.
        """

        def __init__(self, number_of_columns, size_info_for_padding):
            self.number_of_columns = number_of_columns
            self.size_info_for_padding = size_info_for_padding

        @classmethod
        def for_lightly(cls, number_of_columns):
            """Result of a light scan: column count only."""
            return PreScanner.ScanResult(number_of_columns, None)

        @classmethod
        def for_deeply(cls, number_of_columns, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
            """Result of a deep scan: column count plus size information."""
            padding_sizes = cls.SizeInfoForPadding(lhs_max_row_number, lhs_max_row_length,
                                                   rhs_max_row_number, rhs_max_row_length)
            return PreScanner.ScanResult(number_of_columns, padding_sizes)

        class SizeInfoForPadding:
            """Largest row number and largest row display length of each side."""

            def __init__(self, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
                self.lhs_max_row_number = lhs_max_row_number
                self.lhs_max_row_length = lhs_max_row_length
                self.rhs_max_row_number = rhs_max_row_number
                self.rhs_max_row_length = rhs_max_row_length

    def __init__(self):
        pass

    @classmethod
    def scan(cls, context, csv_reader):
        """Run the deep scan when size information is needed, the light scan otherwise."""

        if not context.needs_size_info_for_padding:
            return PreScanner._scan_lightly(csv_reader)

        return PreScanner._scan_deeply(csv_reader)

    @classmethod
    def _scan_deeply(cls, csv_reader):
        """
        Notes
        -----
        Purpose of deep pre-scanning
            * Determine the number of columns for value difference detection
            * Get size information to format the horizontal report
        """

        started_at = time.perf_counter()

        end_of_key = MatchingKeyCodec.END_of_KEY
        display_length_of = UnicodeSupport.string_length_considering_east_asian_characters_of

        lhs_fact = csv_reader.read_lhs()
        rhs_fact = csv_reader.read_rhs()

        number_of_columns = cls._determine_number_of_columns_from(lhs_fact, rhs_fact)

        # Read each side to its end, tracking the longest row representation.
        lhs_max_row_length = 0
        while lhs_fact.lhs_key != end_of_key:
            lhs_max_row_length = max(lhs_max_row_length, display_length_of(str(lhs_fact.lhs_row)))
            lhs_fact = csv_reader.read_lhs()

        rhs_max_row_length = 0
        while rhs_fact.rhs_key != end_of_key:
            rhs_max_row_length = max(rhs_max_row_length, display_length_of(str(rhs_fact.rhs_row)))
            rhs_fact = csv_reader.read_rhs()

        # After reading to the end, the reader state holds the last row numbers.
        lhs_max_row_number = csv_reader.lhs_csv_state.row_number
        rhs_max_row_number = csv_reader.rhs_csv_state.row_number
        logger.debug(f'lhs_max_row_number={lhs_max_row_number}')
        logger.debug(f'rhs_max_row_number={rhs_max_row_number}')

        elapsed_time_ = time.perf_counter() - started_at
        logger.debug(f'PreScanner#scan() elapsed_time:{elapsed_time_}[sec]')

        return PreScanner.ScanResult.for_deeply(number_of_columns,
                                                lhs_max_row_number, lhs_max_row_length,
                                                rhs_max_row_number, rhs_max_row_length)

    @classmethod
    def _scan_lightly(cls, csv_reader):
        """
        Notes
        -----
        Purpose of light pre-scanning
            * Determine the number of columns for value difference detection

        Vertical reports do not require size information for formatting.
        """

        first_lhs_fact = csv_reader.read_lhs()
        first_rhs_fact = csv_reader.read_rhs()

        number_of_columns = cls._determine_number_of_columns_from(first_lhs_fact, first_rhs_fact)
        return PreScanner.ScanResult.for_lightly(number_of_columns)

    @classmethod
    def _determine_number_of_columns_from(cls, lhs_fact, rhs_fact):
        """Return the length of the left row, else of the right row, else 0."""

        if lhs_fact.lhs_row:
            return len(lhs_fact.lhs_row)

        if rhs_fact.rhs_row:
            return len(rhs_fact.rhs_row)

        return 0

class Mark(type):
    """Constant holder for the one-character marks used in the detail report."""

    # Row exists on one side only.
    LHS_ONLY = "<"
    RHS_ONLY = ">"

    # Row exists on both sides, with / without differing values.
    HAS_DIFF = "!"
    NON_DIFF = " "
    NON_DIFF_EXPRESSLY = "="

681

682

class HeadingReporter:
    """Prints the report title and, on request, the context the run is based on.

    Parameters
    ----------
    context
        The context whose settings are printed.
    """

    def __init__(self, context):
        self.cxt = context

    def report_heading(self):
        """Print the title, followed by the context if it was asked for."""

        self._report_title()

        if not self.cxt.shows_context_from_arguments:
            return

        self._report_context()

    @classmethod
    @spacing_before(1)
    def _report_title(cls):
        print('============ Report ============')

    @spacing_before(1)
    def _report_context(self):
        """Print every setting of the context, one line each."""

        cxt = self.cxt
        report_style = "Vertical" if cxt.reports_in_vertical_style else "Two facing (Horizontal)"

        print('● Context')
        print(f'File Path on the Left-Hand Side: {cxt.lhs_file_path}')
        print(f'File Path on the Right-Hand Side : {cxt.rhs_file_path}')
        print(f'Matching Key Indices: {cxt.matching_key_codec.matching_key_info_list}')
        print(f'Matching Key Is Unique?: {cxt.key_should_be_unique}')
        print(f'Column Indices to Ignore: {cxt.column_indices_to_ignore}')
        print(f'with Header?: {cxt.first_row_is_header}')
        print(f'Report Style: {report_style}')
        print(f'Show Count?: {cxt.shows_count}')
        print(f'Show Difference Only?: {cxt.shows_difference_only}')
        print(f'Show All?: {cxt.shows_all_lines}')
        print(f'Show Context?: {cxt.shows_context_from_arguments}')
        print(f'File Encoding for Left-Hand Side: {cxt.encoding_for_lhs}')
        print(f'File Encoding for Right-Hand Side: {cxt.encoding_for_rhs}')
        print(f'CSV Sniffing Size: {cxt.sniffing_size}')
        print('--- csv analysis conditions ---')
        print(f'Forces Individual Specified Conditions?: {cxt.forces_individual_specs}')
        # Separators are printed in their display form rather than as raw characters.
        print(f'column_separator_for_lhs: {cxt.display_string_for_column_separator(cxt.column_separator_for_lhs)}')
        print(f'column_separator_for_rhs: {cxt.display_string_for_column_separator(cxt.column_separator_for_rhs)}')
        print(f'line_separator_for_lhs: {cxt.display_string_for_line_separator(cxt.line_separator_for_lhs, FileArrangement.LHS)}')
        print(f'line_separator_for_rhs: {cxt.display_string_for_line_separator(cxt.line_separator_for_rhs, FileArrangement.RHS)}')
        print(f'quote_char_for_lhs: {cxt.quote_char_for_lhs}')
        print(f'quote_char_for_rhs: {cxt.quote_char_for_rhs}')
        print(f'skips_space_after_column_separator_for_lhs: {cxt.skips_space_after_column_separator_for_lhs}')
        print(f'skips_space_after_column_separator_for_rhs: {cxt.skips_space_after_column_separator_for_rhs}')

729

730

class DetailReporter:
    """Base of the reporters that print one line per compared row.

    Subclasses provide the file-name heading and the output for the three
    matching outcomes (left only, both sides, right only).

    Parameters
    ----------
    context
        The context with the report options.
    """

    # NOTE(review): this is the Python 2 spelling. On Python 3 it is an ordinary
    # class attribute, so the abstract methods below are not enforced at
    # instantiation time; they fail with NotImplementedError when called.
    __metaclass__ = abc.ABCMeta

    def __init__(self, context):
        self.cxt = context

    def report_detail_heading(self):
        """Print the heading of the detail section, if details are reported at all."""

        if self.cxt.shows_details:
            self._report_content_heading()
            self._report_file_name()

    @spacing_before(1)
    def _report_content_heading(self):
        cxt = self.cxt
        if cxt.shows_difference_only:
            print('● Differences')
        elif cxt.shows_all_lines:
            print('● All')

    @abc.abstractmethod
    def _report_file_name(self):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        raise NotImplementedError()

    class Factory:
        """Chooses the reporter that fits the report style of the context."""

        def __init__(self):
            pass

        @staticmethod
        def reporter_for(context, scan_result):
            """Return a vertical or a horizontal reporter for ``context``."""

            reporter_class = VerticalReporter if context.reports_in_vertical_style else HorizontalReporter
            return reporter_class(context, scan_result)

786

787

class HorizontalReporter(DetailReporter):
    """Reporter that prints the left and right rows facing each other on one line."""

    class Template:
        """Produces the fixed-width text fragments of the two-facing layout."""

        DIFFERENT_COLUMN_GUIDE = 'Column indices with difference'
        PREFIX_of_DIFF_COLUMNS = ' @ '

        def __init__(self, lhs_max_row_number_length, lhs_max_row_length, rhs_max_row_number_length, rhs_max_row_length):
            """Derive every column width from the maximum widths found in both files."""

            # Left-hand block: row number, filler, row text.
            self.lhs_max_row_number_length = lhs_max_row_number_length
            self.lhs_filler_length = 1
            self.lhs_max_row_length = lhs_max_row_length
            self.lhs_length = self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length

            # Centre block: a one-character mark surrounded by fillers.
            self.diff_mark_filler_length_in_front = 2
            self.diff_mark_filler_length_in_rear = 2
            self.diff_mark_length = self.diff_mark_filler_length_in_front + 1 + self.diff_mark_filler_length_in_rear

            # Right-hand block: row number, filler, row text.
            self.rhs_max_row_number_length = rhs_max_row_number_length
            self.rhs_filler_length = 1
            self.rhs_max_row_length = rhs_max_row_length
            self.rhs_length = self.rhs_max_row_number_length + self.rhs_filler_length + self.rhs_max_row_length

            self.prefix_length_for_diff_columns_displays = len(self.PREFIX_of_DIFF_COLUMNS)

        # --- heading-related description ---

        def division_string(self):
            """Return a dashed rule as wide as a full report line including the guide text."""
            total_width = (
                self.lhs_length
                + self.diff_mark_length
                + self.rhs_length
                + self.prefix_length_for_diff_columns_displays
                + len(self.DIFFERENT_COLUMN_GUIDE)
            )
            return '-' * total_width

        def file_name_description(self, lhs_file_name, rhs_file_name):
            """Return the heading line: both file names padded to their blocks, then the guide."""
            parts = (
                UnicodeSupport.left_justified(lhs_file_name, self.lhs_length),
                ' ' * self.diff_mark_length,
                UnicodeSupport.left_justified(rhs_file_name, self.rhs_length),
                ' ' * self.prefix_length_for_diff_columns_displays,
            )
            return f'{parts[0]}{parts[1]}{parts[2]}{parts[3]}{self.DIFFERENT_COLUMN_GUIDE}'

        def _diff_mark_area(self, mark):
            """Return *mark* with the front and rear fillers around it."""
            front = ' ' * self.diff_mark_filler_length_in_front
            rear = ' ' * self.diff_mark_filler_length_in_rear
            return front + mark + rear

        # --- left-hand side related description ---

        def lhs_only_description(self, lhs_fact):
            """Return the line for a row that exists only on the left."""
            left_block = self._lhs_description(lhs_fact)
            return f'{left_block}{self._diff_mark_area(Mark.LHS_ONLY)}'

        def _lhs_description(self, lhs_fact):
            """Return the padded 'row-number row' text of the left block."""
            number_text = UnicodeSupport.right_justified(str(lhs_fact.lhs_row_number), self.lhs_max_row_number_length)
            filler = ' ' * self.lhs_filler_length
            row_text = UnicodeSupport.left_justified(str(lhs_fact.lhs_row), self.lhs_max_row_length)
            return f'{number_text}{filler}{row_text}'

        def _lhs_empty_description(self):
            """Return blanks as wide as the left block."""
            width = self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length
            return ' ' * width

        # --- right-hand side related description ---

        def rhs_only_description(self, rhs_fact):
            """Return the line for a row that exists only on the right."""
            blank_left = self._lhs_empty_description()
            centre = self._diff_mark_area(Mark.RHS_ONLY)
            right_block = self._rhs_description(rhs_fact)
            return f'{blank_left}{centre}{right_block}'

        def _rhs_description(self, rhs_fact):
            """Return the padded 'row-number row' text of the right block."""
            number_text = UnicodeSupport.right_justified(str(rhs_fact.rhs_row_number), self.rhs_max_row_number_length)
            filler = ' ' * self.rhs_filler_length
            row_text = UnicodeSupport.left_justified(str(rhs_fact.rhs_row), self.rhs_max_row_length)
            return f'{number_text}{filler}{row_text}'

        # --- both sides related description ---

        def both_description(self, lhs_fact, rhs_fact, value_difference_result):
            """Return the line for a key present on both sides.

            When the values differ, the differing column indices are appended
            after the prefix; otherwise nothing follows the right block.
            """
            left_block = self._lhs_description(lhs_fact)

            if value_difference_result.has_difference:
                mark = Mark.HAS_DIFF
                trailer = f'{self.PREFIX_of_DIFF_COLUMNS}{str(value_difference_result.different_column_indices)}'
            else:
                mark = Mark.NON_DIFF
                trailer = ''

            centre = self._diff_mark_area(mark)
            right_block = self._rhs_description(rhs_fact)
            return f'{left_block}{centre}{right_block}{trailer}'

    def __init__(self, context, scan_result):
        """Create the reporter; a template is built only when padding sizes are needed."""
        super().__init__(context)
        self.cxt = context

        self.template = None
        if context.needs_size_info_for_padding:
            sizes = scan_result.size_info_for_padding
            self.template = HorizontalReporter.Template(
                len(str(sizes.lhs_max_row_number)),
                sizes.lhs_max_row_length,
                len(str(sizes.rhs_max_row_number)),
                sizes.rhs_max_row_length,
            )

    # --- report heading related ---

    def _report_file_name(self):
        """Print the file-name heading between two dashed rules."""
        lhs_name = os.path.basename(self.cxt.lhs_file_name)
        rhs_name = os.path.basename(self.cxt.rhs_file_name)

        print(self.template.division_string())
        print(self.template.file_name_description(lhs_name, rhs_name))
        print(self.template.division_string())

    # --- report each cases ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        """Print a left-only row when details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        """Print a both-sides row when it differs in differences-only mode, or always in all-lines mode."""
        differs_and_wanted = self.cxt.shows_difference_only and value_difference_result.has_difference
        if not (differs_and_wanted or self.cxt.shows_all_lines):
            return
        print(self.template.both_description(lhs_fact, rhs_fact, value_difference_result))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        """Print a right-only row when details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.rhs_only_description(rhs_fact))

916

917

class VerticalReporter(DetailReporter):
    """Reporter that prints the left and right rows on separate, stacked lines."""

    class Template:
        """Produces the text lines of the vertical layout."""

        LHS_MARK = 'L'
        RHS_MARK = 'R'
        PREFIX_of_DIFF_COLUMNS = '@'

        def __init__(self):
            pass

        # --- heading-related description ---

        @classmethod
        def division_string(cls):
            """Return an 80-character dashed rule."""
            return '-' * 80

        @classmethod
        def file_name_description(cls, mark, file_name):
            """Return '<mark> <file_name>'."""
            return f'{mark} {file_name}'

        # --- left-hand side related description ---

        @classmethod
        def lhs_only_description(cls, lhs_fact):
            """Return the line for a row that exists only on the left."""
            number, row = lhs_fact.lhs_row_number, lhs_fact.lhs_row
            return f'{Mark.LHS_ONLY} {cls.LHS_MARK} {number!s} {row!s}'

        # --- right-hand side related description ---

        @classmethod
        def rhs_only_description(cls, rhs_fact):
            """Return the line for a row that exists only on the right."""
            number, row = rhs_fact.rhs_row_number, rhs_fact.rhs_row
            return f'{Mark.RHS_ONLY} {cls.RHS_MARK} {number!s} {row!s}'

        # --- both sides related description ---

        @classmethod
        def both_description_heading(cls, value_difference_result):
            """Return the heading line that precedes a left/right pair."""
            if not value_difference_result.has_difference:
                return Mark.NON_DIFF_EXPRESSLY

            indices = value_difference_result.different_column_indices
            return f'{Mark.HAS_DIFF} {cls.PREFIX_of_DIFF_COLUMNS} {indices!s}'

        @classmethod
        def both_description_lhs(cls, lhs_fact, row_number_length):
            """Return the left line of a pair with the row number right-justified."""
            number_text = str(lhs_fact.lhs_row_number).rjust(row_number_length)
            return f' {cls.LHS_MARK} {number_text} {lhs_fact.lhs_row!s}'

        @classmethod
        def both_description_rhs(cls, rhs_fact, row_number_length):
            """Return the right line of a pair with the row number right-justified."""
            number_text = str(rhs_fact.rhs_row_number).rjust(row_number_length)
            return f' {cls.RHS_MARK} {number_text} {rhs_fact.rhs_row!s}'

    def __init__(self, context, _):
        """Create the reporter; the second argument is accepted but not used."""
        super().__init__(context)
        self.cxt = context
        self.template = VerticalReporter.Template()

    # --- report heading related ---

    def _report_file_name(self):
        """Print both file names, one per line, between two dashed rules."""
        template = self.template
        rule = template.division_string()

        print(rule)
        print(template.file_name_description(template.LHS_MARK, os.path.basename(self.cxt.lhs_file_name)))
        print(template.file_name_description(template.RHS_MARK, os.path.basename(self.cxt.rhs_file_name)))
        print(template.division_string())

    # --- report each cases ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        """Print a left-only row when details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        """Print heading, left line and right line for a key present on both sides.

        Printed when the row differs in differences-only mode, or always in
        all-lines mode.
        """
        differs_and_wanted = self.cxt.shows_difference_only and value_difference_result.has_difference
        if not (differs_and_wanted or self.cxt.shows_all_lines):
            return

        # Both row numbers are padded to the wider of the two so they line up.
        widths = (len(str(lhs_fact.lhs_row_number)), len(str(rhs_fact.rhs_row_number)))
        row_number_length = max(widths)

        print(self.template.both_description_heading(value_difference_result))
        print(self.template.both_description_lhs(lhs_fact, row_number_length))
        print(self.template.both_description_rhs(rhs_fact, row_number_length))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        """Print a right-only row when details are shown."""
        if not self.cxt.shows_details:
            return
        print(self.template.rhs_only_description(rhs_fact))

1013

1014

class CountReporter:
    """Collects per-case counts and row numbers and prints them as a summary."""

    class Counter:
        """Accumulates counts and row numbers for each comparison outcome."""

        def __init__(self):
            # Plain tallies per outcome.
            self.number_of_same_lines = 0
            self.number_of_lhs_only = 0
            self.number_of_rhs_only = 0
            self.number_of_differences = 0

            # Row numbers behind the tallies; differences map lhs row -> rhs row.
            self.row_numbers_for_lhs_only = []
            self.row_numbers_for_rhs_only = []
            self.row_numbers_for_differences = {}

            # Cache for max_digit; filled on first access.
            self._max_digit = None

        def _increment_same_lines(self):
            self.number_of_same_lines = self.number_of_same_lines + 1

        def _increment_lhs_only(self):
            self.number_of_lhs_only = self.number_of_lhs_only + 1

        def _increment_rhs_only(self):
            self.number_of_rhs_only = self.number_of_rhs_only + 1

        def _increment_differences(self):
            self.number_of_differences = self.number_of_differences + 1

        def _add_row_number_for_lhs_only(self, row_number):
            self.row_numbers_for_lhs_only.append(row_number)

        def _add_row_number_for_rhs_only(self, row_number):
            self.row_numbers_for_rhs_only.append(row_number)

        def _add_row_number_for_differences(self, lhs_row_number, rhs_row_number):
            self.row_numbers_for_differences[lhs_row_number] = rhs_row_number

        def count_for_case_of_existed_only_on_lhs(self, row_number):
            """Record one left-only row."""
            self._increment_lhs_only()
            self._add_row_number_for_lhs_only(row_number)

        def count_for_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
            """Record one both-sides row as either 'same' or 'with differences'."""
            if not value_difference_result.has_difference:
                self._increment_same_lines()
                return

            self._increment_differences()
            self._add_row_number_for_differences(lhs_fact.lhs_row_number, rhs_fact.rhs_row_number)

        def count_for_case_of_existed_only_on_rhs(self, row_number):
            """Record one right-only row."""
            self._increment_rhs_only()
            self._add_row_number_for_rhs_only(row_number)

        @property
        def sorted_row_numbers_for_differences(self):
            """List of (lhs_row_number, rhs_row_number) pairs ordered by lhs row number."""
            pairs = self.row_numbers_for_differences.items()
            return sorted(pairs, key=lambda pair: pair[0])

        @property
        def max_digit(self):
            """Width of the widest of the four tallies; computed once, then cached."""
            if self._max_digit is None:
                tallies = (
                    self.number_of_same_lines,
                    self.number_of_lhs_only,
                    self.number_of_rhs_only,
                    self.number_of_differences,
                )
                self._max_digit = max(len(str(tally)) for tally in tallies)
            return self._max_digit

    def __init__(self, shows_count):
        self.shows_count = shows_count
        self.counter = self.Counter()

    def _func_of_right_justified_number(self):
        """Return a function that right-justifies a number to the counter's max_digit."""

        def right_justified(number):
            # max_digit is looked up at call time, as the counter caches it lazily.
            return str(number).rjust(self.counter.max_digit)

        return right_justified

    @spacing_before(1)
    def report_count(self):
        """Print the count summary; prints nothing more when shows_count is falsy."""
        if not self.shows_count:
            return

        print('● Count & Row number')

        counter = self.counter
        rjust = self._func_of_right_justified_number()
        # NOTE(review): literal spacing is reproduced as found in the listing - confirm alignment.
        print(f'same lines : {rjust(counter.number_of_same_lines)}')
        print(f'left side only ({Mark.LHS_ONLY}): {rjust(counter.number_of_lhs_only)} :-- Row Numbers -->: {counter.row_numbers_for_lhs_only}')
        print(f'right side only ({Mark.RHS_ONLY}): {rjust(counter.number_of_rhs_only)} :-- Row Numbers -->: {counter.row_numbers_for_rhs_only}')
        print(f'with differences ({Mark.HAS_DIFF}): {rjust(counter.number_of_differences)} :-- Row Number Pairs -->: {counter.sorted_row_numbers_for_differences}')

1111

1112

class UnicodeSupport:
    """Padding helpers that measure text by display columns rather than code points."""

    @classmethod
    def left_justified(cls, value, length):
        """Return *value* followed by spaces up to *length* display columns.

        No padding is added when *value* is already *length* columns or wider.
        """
        shortfall = length - cls.string_length_considering_east_asian_characters_of(value)
        padding = ' ' * shortfall
        return f'{value}{padding}'

    @classmethod
    def right_justified(cls, value, length):
        """Return *value* preceded by spaces up to *length* display columns.

        No padding is added when *value* is already *length* columns or wider.
        """
        shortfall = length - cls.string_length_considering_east_asian_characters_of(value)
        padding = ' ' * shortfall
        return f'{padding}{value}'

    @staticmethod
    def string_length_considering_east_asian_characters_of(text):
        """Return the column count of *text*.

        Characters whose East Asian width is F, W or A count as two columns;
        every other character counts as one.
        """
        return sum(
            2 if unicodedata.east_asian_width(character) in 'FWA' else 1
            for character in text
        )

1127

1128

1129# ----------------------------------------------------------------------------------------------------------------------

1130# CSV Reading

1131# ----------------------------------------------------------------------------------------------------------------------

1132

class FileArrangement(type):
    """Suffixes that select the left- or right-hand variant of a context attribute.

    The values are appended to attribute base names such as
    ``"column_separator"`` when reading from or writing to the context.
    """

    LHS, RHS = '_for_lhs', '_for_rhs'

1137

1138

1139class CsvDialectFixer:

1140

1141 def __init__(self):

1142 pass

1143

1144 @classmethod

1145 def fixed_dialect(cls, context, csv_file, file_arrangement):

1146

1147 if context.forces_individual_specs:

1148 return cls._dialect_from_context(context, file_arrangement)

1149 else:

1150 return cls._try_sniffing(context, csv_file, file_arrangement)

1151

1152

1153 @classmethod

1154 def _dialect_from_context(cls, context, file_arrangement):

1155

1156 dialect = csv.excel()

1157 dialect.delimiter = getattr(context, "column_separator" + file_arrangement)

1158 dialect.lineterminator = getattr(context, "line_separator" + file_arrangement)

1159 dialect.quotechar = getattr(context, "quote_char" + file_arrangement)

1160 dialect.skipinitialspace = getattr(context, "skips_space_after_column_separator" + file_arrangement)

1161

1162 return dialect, context

1163

1164 @classmethod

1165 def _try_sniffing(cls, context, csv_file, file_arrangement):

1166

1167 try:

1168 return cls._sniff(context, csv_file, file_arrangement)

1169

1170 except csv.Error as e:

1171

1172 logger.warning(f'Sniffing failed. Generated a dialect from context instead. [type={type(e)}, args={str(e.args)}, message={traceback.format_exception_only(type(e), e)}]')

1173 return cls._dialect_from_context(context, file_arrangement)

1174

1175 finally:

1176 csv_file.seek(0)

1177

1178 @classmethod

1179 def _sniff(cls, context, csv_file, file_arrangement):

1180

1181 sample = csv_file.read(context.sniffing_size)

1182 sniffer = csv.Sniffer()

1183 dialect = sniffer.sniff(sample)

1184 has_header = sniffer.has_header(sample)

1185

1186 adjusted_context = cls._adjust_context_with(dialect, has_header, context, file_arrangement)

1187

1188 return dialect, adjusted_context

1189

1190 @classmethod

1191 def _adjust_context_with(cls, dialect, has_header, context, file_arrangement):

1192

1193 setattr(context, "column_separator" + file_arrangement, dialect.delimiter)

1194 setattr(context, "line_separator" + file_arrangement, dialect.lineterminator)

1195 setattr(context, "quote_char" + file_arrangement, dialect.quotechar)

1196 setattr(context, "skips_space_after_column_separator" + file_arrangement, dialect.skipinitialspace)

1197 context.first_row_is_header = has_header if context.header is None else (True if context.header == 'y' else False)

1198

1199 return context

1200

1201

def show_dialect_for_debugging(dialect, context, message, file_arrangement):
    """Write every property of *dialect* to the debug log under a *message* banner.

    The delimiter and line terminator are rendered through the context's
    display helpers; all other properties are logged as they are.
    """
    debug = logger.debug

    debug(f'---{message}---')
    debug(f'sniffing dialect={dialect}')

    # Which of the stock csv dialect classes the dialect is an instance of.
    stock_dialects = (
        ('csv.excel', csv.excel),
        ('csv.excel_tab', csv.excel_tab),
        ('csv.unix_dialect', csv.unix_dialect),
    )
    for label, dialect_class in stock_dialects:
        debug(f'sniffing dialect {label}={isinstance(dialect, dialect_class)}')

    debug(f'sniffing dialect.delimiter={context.display_string_for_column_separator(dialect.delimiter)}')
    debug(f'sniffing dialect.doublequote={dialect.doublequote}')
    debug(f'sniffing dialect.escapechar={dialect.escapechar}')
    debug(f'sniffing dialect.lineterminator={context.display_string_for_line_separator(dialect.lineterminator, file_arrangement)}')
    debug(f'sniffing dialect.quotechar={dialect.quotechar}')
    debug(f'sniffing dialect.quoting={dialect.quoting}')
    debug(f'sniffing dialect.skipinitialspace={dialect.skipinitialspace}')

class LhsFact:
    """One row read from the left-hand file, with its row number and matching key."""

    def __init__(self, lhs_row_number, lhs_row, lhs_key):
        """Log the creation at debug level and store the three values unchanged."""
        logger.debug(f'LhsFact 生成 lhs_row_number={lhs_row_number}, lhs_row={lhs_row}, lhs_key={lhs_key}')

        self.lhs_row_number, self.lhs_row, self.lhs_key = lhs_row_number, lhs_row, lhs_key

1228

1229

class RhsFact:
    """One row read from the right-hand file, with its row number and matching key."""

    def __init__(self, rhs_row_number, rhs_row, rhs_key):
        """Log the creation at debug level and store the three values unchanged."""
        logger.debug(f'RhsFact 生成 rhs_row_number={rhs_row_number}, rhs_row={rhs_row}, rhs_key={rhs_key}')

        self.rhs_row_number, self.rhs_row, self.rhs_key = rhs_row_number, rhs_row, rhs_key

class CsvReader:
    """Reads the left and right CSV files row by row and validates key order.

    Each read returns a fact object carrying the row, its row number and its
    matching key. Keys must not decrease within a file and, when the context
    requires it, must not repeat; a violation is logged and terminates the
    process with exit status 1.
    """

    class State:
        """Reading state of one file: csv reader, row counter and previous key."""

        def __init__(self, csv_file, dialect, file_name, first_row_is_header):
            """Wrap *csv_file* in a ``csv.reader`` and start counting from row 0."""
            self._csv_file = csv_file
            self._dialect = dialect
            self._file_name = file_name
            self._first_row_is_header = first_row_is_header

            self._csv_reader = csv.reader(csv_file, dialect)
            self._row_number = 0
            self._previous_key = ""

        def reset(self):
            """Rewind the file and start over with a fresh reader and counters."""
            self._csv_file.seek(0)
            self._csv_reader = csv.reader(self._csv_file, self._dialect)
            self._row_number = 0
            self._previous_key = ""

        def increment_row_number(self):
            """Advance the row counter unless the end-of-key marker was already recorded."""
            if self._previous_key == MatchingKeyCodec.END_of_KEY:
                return

            self._row_number += 1

        def key_changed(self, new_key):
            """Remember *new_key* as the previous key, except while on the header row."""
            if self._is_header():
                return

            self._previous_key = new_key

        def _is_header(self):
            """True while no row has been counted yet and the first row is a header."""
            return self.row_number == 0 and self._first_row_is_header

        @property
        def csv_reader(self):
            return self._csv_reader

        @property
        def file_name(self):
            return self._file_name

        @property
        def row_number(self):
            return self._row_number

        @property
        def previous_key(self):
            return self._previous_key

    def __init__(self, lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, context):
        """Set up the state of both files and skip their header rows if present.

        Args:
            lhs_csv: open file object of the left-hand CSV.
            rhs_csv: open file object of the right-hand CSV.
            lhs_dialect: csv dialect used for the left-hand file.
            rhs_dialect: csv dialect used for the right-hand file.
            context: run context; supplies file names, ``first_row_is_header``,
                ``matching_key_codec`` and ``key_should_be_unique``.
        """
        show_dialect_for_debugging(lhs_dialect, context, '左CSV', FileArrangement.LHS)
        show_dialect_for_debugging(rhs_dialect, context, '右CSV', FileArrangement.RHS)

        self.lhs_csv_state = CsvReader.State(lhs_csv, lhs_dialect, context.lhs_file_name, context.first_row_is_header)
        self.rhs_csv_state = CsvReader.State(rhs_csv, rhs_dialect, context.rhs_file_name, context.first_row_is_header)
        self.cxt = context

        self.skip_header()

    def skip_header(self):
        """Read and discard one row from each file when the first row is a header."""
        if self.cxt.first_row_is_header:
            self.read_lhs()
            self.read_rhs()

    def reset(self):
        """Rewind both files and skip their headers again."""
        self.lhs_csv_state.reset()
        self.rhs_csv_state.reset()
        self.skip_header()

    def read_lhs(self):
        """Read the next left-hand row and return it as an ``LhsFact``."""
        lhs_row, lhs_key = self._read_csv(self.lhs_csv_state)
        self.lhs_csv_state.increment_row_number()
        return LhsFact(self.lhs_csv_state.row_number, lhs_row, lhs_key)

    def read_rhs(self):
        """Read the next right-hand row and return it as an ``RhsFact``."""
        rhs_row, rhs_key = self._read_csv(self.rhs_csv_state)
        self.rhs_csv_state.increment_row_number()
        return RhsFact(self.rhs_csv_state.row_number, rhs_row, rhs_key)

    def _read_csv(self, csv_state):
        """Return ``(row, key)`` for the next row of *csv_state*.

        At end of file an empty row and the end-of-key marker are returned.
        """
        try:
            row = next(csv_state.csv_reader)
        except StopIteration:
            csv_state.key_changed(MatchingKeyCodec.END_of_KEY)
            return [], MatchingKeyCodec.END_of_KEY

        new_key = self.cxt.matching_key_codec.managed_key_for(row)
        # Validate against the previous key before it is overwritten below.
        self._detect_key_violation(new_key, csv_state)

        csv_state.key_changed(new_key)

        return row, new_key

    def _detect_key_violation(self, new_key, csv_state):
        """Log an error and exit with status 1 when keys are unsorted or duplicated.

        Raises:
            SystemExit: with code 1 when *new_key* is smaller than the previous
                key, or equal to it while ``context.key_should_be_unique`` is set.
        """
        # Nothing to compare against until a first key has been recorded.
        if csv_state.previous_key == '':
            return

        if new_key < csv_state.previous_key:
            logger.error(f'matching keys in {csv_state.file_name} are not sorted.'
                         f' [current_key={MatchingKeyCodec.decode_key(new_key)}, previous_key={MatchingKeyCodec.decode_key(csv_state.previous_key)}, matching-key-indices={self.cxt.matching_key_codec.matching_key_info_list}]'
                         f' If the key is a number without zero padding, specify the max size of the key after colon like -k0:8.')
            # sys.exit rather than the site-provided exit(), which is meant for
            # interactive use and is not available in every runtime.
            sys.exit(1)

        if self.cxt.key_should_be_unique and new_key == csv_state.previous_key:
            logger.error(f'matching keys in {csv_state.file_name} are not unique.'
                         f' [current_key={MatchingKeyCodec.decode_key(new_key)}, previous_key={MatchingKeyCodec.decode_key(csv_state.previous_key)}, matching-key-indices={self.cxt.matching_key_codec.matching_key_info_list}]')
            sys.exit(1)

1362

1363

# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()

# Coverage for src/csvdiff3/csvdiff.py: 96% (784 statements: 755 run, 29 missing, 0 excluded)