Coverage for src/csvdiff2/csvdiff.py: 96%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1#!/usr/bin/env python2

2# -*- coding: utf-8 -*-

3import abc

4import csv

5import functools

6import logging

7import os

8import sys

9import timeit

10from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

13# ----------------------------------------------------------------------------------------------------------------------

14# Decorators

15# ----------------------------------------------------------------------------------------------------------------------

def show_execution_time():
    """Build a decorator that prints how long each call of the decorated function takes.

    Returns:
        A decorator. After every call the wrapped function prints a blank line,
        the elapsed time measured with ``timeit.default_timer()`` and another
        blank line, then returns whatever the decorated function returned.
    """

    def _execution_time(func):

        @functools.wraps(func)  # keep the name and docstring of the decorated function
        def wrapper(*args, **kwargs):

            start = timeit.default_timer()
            result = func(*args, **kwargs)
            elapsed_time = timeit.default_timer() - start

            # A bare `print` only emits a blank line when it is the Python 2 statement;
            # print('') emits the blank line with both the statement and the function form.
            print('')
            print("elapsed_time={0}".format(elapsed_time) + "[sec]")
            print('')

            # Hand the result back to the caller instead of silently dropping it.
            return result

        return wrapper

    return _execution_time

def spacing_before(number_of_lines):
    """Build a decorator that prints blank lines before each call of the decorated function.

    Args:
        number_of_lines: How many blank lines to print. A falsy value
            (``None``, ``0``) is treated as 1.

    Returns:
        A decorator. The wrapped function prints the blank lines, calls the
        decorated function and returns its result.
    """

    number_of_lines = number_of_lines or 1

    def _spacing_before(func):

        @functools.wraps(func)  # keep the name and docstring of the decorated function
        def wrapper(*args, **kwargs):

            for _ in range(number_of_lines):
                print('')

            # Propagate the result so that decorating a function never changes what it returns.
            return func(*args, **kwargs)

        return wrapper

    return _spacing_before

55# ----------------------------------------------------------------------------------------------------------------------

56# Entrance

57# ----------------------------------------------------------------------------------------------------------------------

59# @show_execution_time()

def main():
    """Entry point: set up logging, build the context from the arguments and run the comparison.

    An IndexError escaping the comparison is logged as a probable column-count
    mismatch in the CSV data, and the process exits with status 1.
    """

    configure()

    context = context_from_arguments()
    show_context_for_debugging(context)

    try:
        run_in(context)
    except IndexError as error:
        message = 'It is possible that the number of columns in the row is not aligned. Please check the csv data. If not, please file an issue. [{}, description={}]'.format(type(error), error)
        logger.error(message)
        sys.exit(1)

class App(type):
    """Holder for application-wide constants."""

    # Version string of this program.
    VERSION = '1.0.0'

class LoggingConfig(type):
    """Holder for the logging levels, formats and log-file path used by this module."""

    # If you want to debug, play with the CONSOLE_LEVEL or FILE_LEVEL.

    # Level passed to logging.basicConfig().
    BASE_LEVEL = logging.DEBUG

    # Threshold and layout of the messages written to the console.
    CONSOLE_LEVEL = logging.ERROR
    CONSOLE_FORMAT = '%(levelname)s: %(message)s'

    # Threshold, layout and destination of the messages written to the log file.
    FILE_LEVEL = logging.WARNING
    FILE_FORMAT = '%(asctime)s: %(levelname)s: %(message)s'
    FILE_PATH = 'csvdiff.log'

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)


def configure():
    """Attach a console handler and a file handler to the module logger.

    The console handler uses LoggingConfig.CONSOLE_LEVEL / CONSOLE_FORMAT; the
    file handler writes to LoggingConfig.FILE_PATH (opened with mode 'w', so an
    existing file is overwritten) using FILE_LEVEL / FILE_FORMAT. Propagation to
    the root logger is switched off so messages are emitted by these two
    handlers only.
    """

    logging.basicConfig(level=LoggingConfig.BASE_LEVEL)

    # basicConfig() does nothing when the root logger already has handlers, in which case
    # the level inherited from the root would not be BASE_LEVEL. Setting it on this logger
    # makes the handler thresholds below effective regardless of the root configuration.
    logger.setLevel(LoggingConfig.BASE_LEVEL)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(LoggingConfig.CONSOLE_LEVEL)
    stream_handler.setFormatter(logging.Formatter(LoggingConfig.CONSOLE_FORMAT))

    file_handler = logging.FileHandler(filename=LoggingConfig.FILE_PATH, mode='w')
    file_handler.setLevel(LoggingConfig.FILE_LEVEL)
    file_handler.setFormatter(logging.Formatter(LoggingConfig.FILE_FORMAT))

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.propagate = False

112

113

114# ----------------------------------------------------------------------------------------------------------------------

115# Context Preparation

116# ----------------------------------------------------------------------------------------------------------------------

117

def context_from_arguments():
    """Parse the command-line arguments and wrap them in a Context.

    Returns:
        A Context built from ``parser.parse_args()``.
    """

    def arg_type_matching_key_in_csv(x):
        # A list comprehension always yields a real list (map() is lazy on Python 3).
        return [MatchingKeyInfo(spec) for spec in x.split(',')]

    def arg_type_int_in_csv(x):
        return [int(index) for index in x.split(',')]

    parser = ArgumentParser(prog='csv-diff-python2@blue-monk', formatter_class=ArgumentDefaultsHelpFormatter)

    # Program name & Version -------------------------------------------------------------------------------------------
    parser.add_argument('--version', action='version', version='%(prog)s {}'.format(App.VERSION))

    # Input CSV file paths ---------------------------------------------------------------------------------------------
    parser.add_argument('lhs_file_name', type=str, help='Absolute/Relative path to left-hand side file.')
    parser.add_argument('rhs_file_name', type=str, help='Absolute/Relative path to right-hand side file.')

    # Matching conditions ----------------------------------------------------------------------------------------------
    # The string default '0' is run through the `type` callable by argparse, like a command-line value.
    parser.add_argument('-k', '--matching-keys', type=arg_type_matching_key_in_csv, default='0',
                        help='Matching key indices(from 0) for Input CSV in CSV format. For non-fixed length numbers, specify the number of digits after ":". e.g.: 0:8,3')
    parser.add_argument('-u', '--unique-key', default=False, action='store_true',
                        help="Specify if the matching key is unique. Then, if it detects that the matching key is not unique, an error will occur.")
    parser.add_argument('-i', '--ignore-columns', type=arg_type_int_in_csv, default=[],
                        help='Specify the index of the column to be ignored in CSV format. e.g.: 3,7')

    # Report styles ----------------------------------------------------------------------------------------------------
    parser.add_argument('-v', '--vertical-style', default=False, action='store_true',
                        help='Report in vertical style. If not specified, report in horizontal(two facing) style.')

    parser.add_argument('-c', '--show-count', default=False, action='store_true',
                        help='Report the number of differences. Treat this as True if neither -d nor -a is specified.')

    display_group = parser.add_mutually_exclusive_group()
    display_group.add_argument('-d', '--show-difference-only', default=False, action='store_true',
                               help='Report the lines with the difference. Can be used with option -c. Cannot be used with option -a.')
    display_group.add_argument('-a', '--show-all-lines', action='store_true',
                               help='Report on all lines. Can be used with option -c. Cannot be used with option -d.')

    parser.add_argument('-x', '--show-context-from-arguments', default=False, action='store_true',
                        help='Report the context generated from the arguments and CSV sniffing.')

    # CSV analysis conditions ------------------------------------------------------------------------------------------
    parser.add_argument('-H', '--header', type=str, default=None, choices=['n', 'y'],
                        help='If specified, this specification will be enforced.')

    # type=int so that a user-supplied size is a number, just like the default 4096
    # (with type=str a value given on the command line arrived as a string).
    parser.add_argument('-S', '--sniffing-size', type=int, default=4096,
                        help="If csv sniffing fails, try specifying a size larger than 4096. Or Explicitly specify CSV file conditions like '--column-separator-for-lhs TAB'. Check help with -h option.")

    parser.add_argument('-F', '--force-individual-specs', action='store_true',
                        help="If you don't want to rely on csv sniffing, specify it, and then specify --column-separator and so on separately.")

    parser.add_argument('--column-separator', type=str, default=None, choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process both sides CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator', type=str, default=None, choices=['LF', 'CRLF'],
                        help='Process both sides CSV file using the specified line separator.')

    parser.add_argument('--quote-char', type=str, default=None, choices=['"', "'"],
                        help='Process both sides CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator', action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the both sides CSV file.')

    # CSV analysis conditions by left and right ------------------------------------------------------------------------
    parser.add_argument('--column-separator-for-lhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process left-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--column-separator-for-rhs', type=str, default="COMMA", choices=['COMMA', 'TAB', 'SEMICOLON'],
                        help='Process right-hand side CSV file using the specified column delimiter.')

    parser.add_argument('--line-separator-for-lhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process left-hand side CSV file using the specified line separator.')

    parser.add_argument('--line-separator-for-rhs', type=str, default="LF", choices=['LF', 'CRLF'],
                        help='Process right-hand side CSV file using the specified line separator.')

    parser.add_argument('--quote-char-for-lhs', type=str, default='"', choices=['"', "'"],
                        help='Process left-hand side CSV file using the specified quote character.')

    parser.add_argument('--quote-char-for-rhs', type=str, default='"', choices=['"', "'"],
                        help='Process right-hand side CSV file using the specified quote character.')

    parser.add_argument('--no-skip-space-after-column-separator-for-lhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the left side.')

    parser.add_argument('--no-skip-space-after-column-separator-for-rhs', default=False, action='store_true',
                        help='Specify when you want to treat the space immediately after the separator as data for the CSV file on the right side.')

    # ------------------------------------------------------------------------------------------------------------------

    return Context(parser.parse_args())

209

210

class Context:
    """Run-time settings derived from the parsed command-line arguments."""

    # Symbolic line-separator names and the characters they stand for.
    LINE_SEPARATOR_s = {
        "CR": '\r',
        "LF": '\n',
        "CRLF": '\r\n',
        None: '<None>',
    }

    # Symbolic column-separator names and the characters they stand for.
    COLUMN_SEPARATOR_s = {
        "COMMA": ',',
        "TAB": '\t',
        "SEMICOLON": ';',
        None: '<None>',
    }

    def __init__(self, args):
        """Copy the parsed arguments into attributes, then validate and normalize them.

        Args:
            args: The namespace produced by the argument parser.

        The process exits with status 1 when either input path does not exist
        or is not a regular file (see _validate).
        """

        # Input CSV file paths ---------------------------------------------------------------------------------------------
        self.lhs_file_name = args.lhs_file_name
        self.rhs_file_name = args.rhs_file_name
        self.lhs_file_path = os.path.abspath(args.lhs_file_name)
        self.rhs_file_path = os.path.abspath(args.rhs_file_name)

        # Matching conditions ----------------------------------------------------------------------------------------------
        self.matching_key_codec = MatchingKeyCodec(args.matching_keys)
        self.key_should_be_unique = args.unique_key
        self.column_indices_to_ignore = args.ignore_columns

        # Report styles ----------------------------------------------------------------------------------------------------
        self.reports_in_vertical_style = args.vertical_style
        self.reports_in_horizontal_style = not args.vertical_style

        self.shows_count = args.show_count
        self.shows_difference_only = args.show_difference_only
        self.shows_all_lines = args.show_all_lines
        self.shows_details = bool(self.shows_difference_only or self.shows_all_lines)
        self.shows_context_from_arguments = args.show_context_from_arguments

        # Padding sizes are only needed when detail lines are printed side by side.
        self.needs_size_info_for_padding = self.shows_details and self.reports_in_horizontal_style

        # CSV analysis conditions ------------------------------------------------------------------------------------------
        self.header = args.header
        self.first_row_is_header = None

        self.sniffing_size = args.sniffing_size

        self.forces_individual_specs = args.force_individual_specs

        # For each condition: a both-sides option given together with -F wins,
        # otherwise the per-side options apply.
        if self.forces_individual_specs and args.column_separator:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator]
        else:
            self.column_separator_for_lhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_lhs]
            self.column_separator_for_rhs = self.COLUMN_SEPARATOR_s[args.column_separator_for_rhs]

        if self.forces_individual_specs and args.line_separator:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator]
        else:
            self.line_separator_for_lhs = self.LINE_SEPARATOR_s[args.line_separator_for_lhs]
            self.line_separator_for_rhs = self.LINE_SEPARATOR_s[args.line_separator_for_rhs]

        if self.forces_individual_specs and args.quote_char:
            self.quote_char_for_lhs = args.quote_char
            self.quote_char_for_rhs = args.quote_char
        else:
            self.quote_char_for_lhs = args.quote_char_for_lhs
            self.quote_char_for_rhs = args.quote_char_for_rhs

        if self.forces_individual_specs and args.no_skip_space_after_column_separator:
            self.skips_space_after_column_separator_for_lhs = not args.no_skip_space_after_column_separator
            self.skips_space_after_column_separator_for_rhs = not args.no_skip_space_after_column_separator
        else:
            # Honour the per-side flags, which used to be parsed but ignored (both sides were
            # hard-wired to True). getattr keeps namespaces without these attributes working.
            self.skips_space_after_column_separator_for_lhs = \
                not getattr(args, 'no_skip_space_after_column_separator_for_lhs', False)
            self.skips_space_after_column_separator_for_rhs = \
                not getattr(args, 'no_skip_space_after_column_separator_for_rhs', False)

        self._validate()
        self._normalize()

    def _validate(self):
        """Exit with status 1 unless both input paths exist and are regular files."""

        if not os.path.exists(self.lhs_file_path):
            logger.error('lhs_file_path not exists. [lhs_file_path={}]'.format(self.lhs_file_path))
            sys.exit(1)
        if not os.path.exists(self.rhs_file_path):
            logger.error('rhs_file_path not exists. [rhs_file_path={}]'.format(self.rhs_file_path))
            sys.exit(1)

        if not os.path.isfile(self.lhs_file_path):
            logger.error('lhs_file_path is not a file. [lhs_file_path={}]'.format(self.lhs_file_path))
            sys.exit(1)
        if not os.path.isfile(self.rhs_file_path):
            logger.error('rhs_file_path is not a file. [rhs_file_path={}]'.format(self.rhs_file_path))
            sys.exit(1)

    def _normalize(self):
        """Fall back to showing the count when no display option was requested."""

        if not any([self.shows_count, self.shows_difference_only, self.shows_all_lines]):
            self.shows_count = True

    def display_string_for_column_separator(self, value):
        """Return the symbolic name of a column separator for display.

        Args:
            value: A column separator string such as ',' or a tab.

        Returns:
            The matching key of COLUMN_SEPARATOR_s, or 'undefined(<value>)'
            when the value is not one of the known separators.
        """

        candidates = [k for k, v in self.COLUMN_SEPARATOR_s.items() if v == value]
        if candidates:
            return candidates[0]

        # The fallback string used to be built but never returned, yielding None.
        return 'undefined({})'.format(value)

319

320

def show_context_for_debugging(cxt):
    """Write every setting held by the context to the debug log, one '<name>=<value>' entry each."""

    def debug_attributes(*attribute_names):
        # Logs '<attribute name>=<attribute value>' for each name, in the given order.
        for attribute_name in attribute_names:
            logger.debug('{}={}'.format(attribute_name, getattr(cxt, attribute_name)))

    debug_attributes('lhs_file_name', 'rhs_file_name', 'lhs_file_path', 'rhs_file_path')

    debug_attributes('matching_key_codec', 'key_should_be_unique', 'column_indices_to_ignore')

    debug_attributes('reports_in_vertical_style', 'reports_in_horizontal_style',
                     'shows_count', 'shows_difference_only', 'shows_all_lines',
                     'shows_context_from_arguments', 'needs_size_info_for_padding')

    debug_attributes('first_row_is_header', 'sniffing_size')
    # The label of this entry differs from the attribute name, so it is written out by hand.
    logger.debug('force_individual_specs={}'.format(cxt.forces_individual_specs))

    # Column separators are logged by symbolic name, line separators as hex.
    lhs_column_separator = cxt.display_string_for_column_separator(cxt.column_separator_for_lhs)
    logger.debug('column_separator_for_lhs={}'.format(lhs_column_separator))
    rhs_column_separator = cxt.display_string_for_column_separator(cxt.column_separator_for_rhs)
    logger.debug('column_separator_for_rhs={}'.format(rhs_column_separator))
    logger.debug('line_separator_for_lhs={}'.format(cxt.line_separator_for_lhs.encode('hex')))
    logger.debug('line_separator_for_rhs={}'.format(cxt.line_separator_for_rhs.encode('hex')))
    debug_attributes('quote_char_for_lhs', 'quote_char_for_rhs',
                     'skips_space_after_column_separator_for_lhs',
                     'skips_space_after_column_separator_for_rhs')

    logger.debug('MatchingKeyCodec#END_of_KEY={}'.format(MatchingKeyCodec.END_of_KEY))

354

355

356# ----------------------------------------------------------------------------------------------------------------------

357# Matching Key Treatment

358# ----------------------------------------------------------------------------------------------------------------------

359

class MatchingKeyInfo:
    """One matching-key column, given on the command line as ``INDEX`` or ``INDEX:MAX_LENGTH``.

    Attributes are ``index`` (column index of the key) and ``max_length``
    (width the key value is left-padded to with '0'; 0 when not specified).
    """

    def __init__(self, specified_string):
        """Parse a single key specification.

        Args:
            specified_string: Text such as '0' or '0:8'. Empty parts produced by
                stray ':' characters are ignored.

        A specification whose parts are missing or not numeric is reported through
        the logger and the process exits with status 1.
        """

        # A list comprehension always yields a real, indexable list (filter() is lazy on Python 3).
        elements = [element for element in specified_string.split(':') if element != '']

        # With no parts at all, '' falls through to the numeric check below, which reports
        # the problem and exits, instead of raising an unhandled IndexError here.
        index = elements[0] if elements else ''
        self.index = self._transform_into_numeric(index, 'index')

        max_length = elements[1] if len(elements) > 1 else '0'
        self.max_length = self._transform_into_numeric(max_length, 'max_length')

    def __repr__(self):
        return '{}({!r}, {!r})'.format(self.__class__.__name__, self.index, self.max_length if self.max_length > 0 else '<not specified>')

    @classmethod
    def _transform_into_numeric(cls, value, name):
        """Return ``value`` as an int, or log an error and exit with status 1 if it is not all digits."""

        if not value.isdigit():
            logger.error('MATCHING_KEY_INDICES should be a number. See also help. [specified {}={}]'.format(name, value))
            # sys.exit() rather than the site-provided exit() builtin, matching Context._validate.
            sys.exit(1)

        return int(value)

    def key_for(self, row):
        """Return the key value of ``row``, left-padded with '0' up to ``max_length`` characters."""
        return row[self.index].rjust(self.max_length, '0')

386

387

class MatchingKeyCodec:
    """Builds and decodes the composite string key used to match rows."""

    # Sentinel key compared against row keys to detect the end of an input.
    END_of_KEY = 'ZZZ'
    # Placed before, between and after the key parts of a composite key.
    SEPARATOR = '..'

    def __init__(self, matching_key_info_list):
        """Keep the list of key-column descriptions used to build keys."""
        self.matching_key_info_list = matching_key_info_list

    def __repr__(self):
        return '{}({!r})'.format(self.__class__.__name__, self.matching_key_info_list)

    def managed_key_for(self, row):
        """Return the composite key of ``row``: '..<key1>..<key2>..' and so on.

        Args:
            row: Sequence of column values of one CSV row.

        If a key index is out of range for the row, the error is logged and the
        process exits with status 1.
        """

        try:
            return self.SEPARATOR + ''.join(matching_key.key_for(row) + self.SEPARATOR
                                            for matching_key in self.matching_key_info_list)
        except IndexError:
            logger.error('one of the indices specified for MATCHING_KEY_INDICES is out of range [MATCHING_KEY_INDICES={}, number of columns = {}, row={}]'.format(self.matching_key_info_list, len(row), row))
            # sys.exit() rather than the site-provided exit() builtin, matching Context._validate.
            sys.exit(1)

    @property
    def matching_key_indices(self):
        """List of the column indices that make up the matching key."""
        return [matching_key_info.index for matching_key_info in self.matching_key_info_list]

    @classmethod
    def decode_key(cls, key):
        """ Split a composite key into its parts. Leave the padding as it is. """

        # Remove exactly one leading and one trailing separator. str.strip(SEPARATOR) would
        # treat the separator as a character set and also eat dots belonging to the key
        # parts themselves (e.g. a key value of '.5').
        if key.startswith(cls.SEPARATOR):
            key = key[len(cls.SEPARATOR):]
        if key.endswith(cls.SEPARATOR):
            key = key[:-len(cls.SEPARATOR)]

        return key.split(cls.SEPARATOR)

416

417

418# ----------------------------------------------------------------------------------------------------------------------

419# Control and Determine if it exists only on the left, only on the right, or both

420# ----------------------------------------------------------------------------------------------------------------------

421

def run_in(context):
    """Open both CSV files and run the comparison described by ``context``.

    The dialect of each file is fixed first (left-hand side, then right-hand
    side), the reader is pre-scanned and reset, and the difference detection
    runs on the adjusted context.
    """

    with open(context.lhs_file_path, mode='r') as lhs_csv:
        with open(context.rhs_file_path, mode='r') as rhs_csv:

            # Fixing the dialect returns a possibly adjusted context, which is threaded through.
            lhs_dialect, adjusted_context = CsvDialectFixer.fixed_dialect(context, lhs_csv, FileArrangement.LHS)
            rhs_dialect, adjusted_context = CsvDialectFixer.fixed_dialect(adjusted_context, rhs_csv, FileArrangement.RHS)

            reader = CsvReader(lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, adjusted_context)

            pre_scan_result = PreScanner.scan(adjusted_context, reader)
            reader.reset()

            detect_diff(adjusted_context, reader, pre_scan_result)

435

436

def detect_diff(context, csv_reader, pre_scan_result):
    """Match the rows of both inputs by key, then count and report every outcome.

    Args:
        context: Settings of this run.
        csv_reader: Reader handed to perform_key_matching().
        pre_scan_result: Result of the pre-scan; supplies the number of columns
            and is passed on to the detail reporter.
    """

    detector = ValueDifferenceDetector(pre_scan_result.number_of_columns,
                                       context.matching_key_codec.matching_key_indices,
                                       context.column_indices_to_ignore)

    heading_reporter = HeadingReporter(context)
    detail_reporter = DetailReporter.Factory.reporter_for(context, pre_scan_result)
    count_reporter = CountReporter(context.shows_count)
    counter = count_reporter.counter

    heading_reporter.report_heading()
    detail_reporter.report_detail_heading()

    # The three callbacks below are invoked by perform_key_matching(); each one counts
    # the case first and reports it second.

    def on_lhs_only(lhs_fact):
        counter.count_for_case_of_existed_only_on_lhs(lhs_fact.lhs_row_number)
        detail_reporter.report_case_of_existed_only_on_lhs(lhs_fact)

    def on_both_sides(lhs_fact, rhs_fact):
        difference = detector.detect_difference_between(lhs_fact.lhs_row, rhs_fact.rhs_row)
        counter.count_for_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)
        detail_reporter.report_case_of_existed_on_both_sides(lhs_fact, rhs_fact, difference)

    def on_rhs_only(rhs_fact):
        counter.count_for_case_of_existed_only_on_rhs(rhs_fact.rhs_row_number)
        detail_reporter.report_case_of_existed_only_on_rhs(rhs_fact)

    perform_key_matching(csv_reader, on_lhs_only, on_both_sides, on_rhs_only)

    count_reporter.report_count()

469

470

def perform_key_matching(csv_reader, callback_for_lhs_only, callback_for_both_sides, callback_for_rhs_only):
    """Walk both inputs in parallel and dispatch each row by comparing the keys.

    Args:
        csv_reader: Provides read_lhs() / read_rhs(), each returning the next fact
            of its side.
        callback_for_lhs_only: Called with the left fact when its key is the smaller one.
        callback_for_both_sides: Called with both facts when the keys are equal.
        callback_for_rhs_only: Called with the right fact when its key is the smaller one.

    The walk stops once both sides report MatchingKeyCodec.END_of_KEY.
    NOTE(review): this looks like a merge over inputs sorted by key in ascending
    order - confirm against the reader.
    """

    end_of_key = MatchingKeyCodec.END_of_KEY

    lhs_fact = csv_reader.read_lhs()
    rhs_fact = csv_reader.read_rhs()

    while not (lhs_fact.lhs_key == end_of_key and rhs_fact.rhs_key == end_of_key):

        lhs_key, rhs_key = lhs_fact.lhs_key, rhs_fact.rhs_key

        if lhs_key == rhs_key:
            # Same key on both sides: report the pair and advance both.
            callback_for_both_sides(lhs_fact, rhs_fact)
            lhs_fact = csv_reader.read_lhs()
            rhs_fact = csv_reader.read_rhs()

        elif lhs_key < rhs_key:
            # The left key has no counterpart on the right: advance the left side only.
            callback_for_lhs_only(lhs_fact)
            lhs_fact = csv_reader.read_lhs()

        elif lhs_key > rhs_key:
            # The right key has no counterpart on the left: advance the right side only.
            callback_for_rhs_only(rhs_fact)
            rhs_fact = csv_reader.read_rhs()

490

491

492# ----------------------------------------------------------------------------------------------------------------------

493# Value-Difference Detection

494# ----------------------------------------------------------------------------------------------------------------------

495

class ValueDifferenceDetector:
    """Finds the columns whose values differ between two rows with the same key."""

    class ValueDifferenceResult:
        """Indices of the columns that differ between a pair of rows."""

        def __init__(self, different_column_indices):
            self.different_column_indices = different_column_indices

        @property
        def has_difference(self):
            """True when at least one column differs."""
            return bool(self.different_column_indices)

    def __init__(self, number_of_columns, matching_key_indices, ignore_column_indices):
        """Work out which columns take part in the comparison.

        Args:
            number_of_columns: Number of columns of a row.
            matching_key_indices: Indices of the key columns; excluded from the comparison.
            ignore_column_indices: Indices of the columns to ignore; excluded as well.
        """

        self.column_indices = range(0, number_of_columns)
        logger.debug('column_indices={}'.format(self.column_indices))

        self.target_column_indices = set(self.column_indices) - set(matching_key_indices) - set(ignore_column_indices)
        logger.debug('target_column_indices={}'.format(self.target_column_indices))

        # Set iteration order is not guaranteed to be ascending, so rows are compared through
        # a sorted snapshot; differing columns are then always reported in column order.
        self._ordered_target_column_indices = sorted(self.target_column_indices)

    def detect_difference_between(self, lhs_row, rhs_row):
        """Compare two rows column by column.

        Args:
            lhs_row: Column values of the left-hand row.
            rhs_row: Column values of the right-hand row.

        Returns:
            A ValueDifferenceResult holding the differing column indices in ascending order.

        Raises:
            IndexError: When a row has fewer columns than a compared index.
        """

        different_column_indices = [index for index in self._ordered_target_column_indices
                                    if lhs_row[index] != rhs_row[index]]
        logger.debug('different_column_indices={}'.format(different_column_indices))
        return self.ValueDifferenceResult(different_column_indices)

522

523

524# ----------------------------------------------------------------------------------------------------------------------

525# Reporting

526# ----------------------------------------------------------------------------------------------------------------------

527

class PreScanner:
    """Reads the inputs ahead of the comparison to collect the column count and, if needed, size information."""

    class ScanResult:
        """Outcome of a pre-scan: the column count plus optional padding sizes."""

        def __init__(self, number_of_columns, size_info_for_padding):
            self.number_of_columns = number_of_columns
            self.size_info_for_padding = size_info_for_padding

        @classmethod
        def for_lightly(cls, number_of_columns):
            """Result of a light scan; carries no size information."""
            return PreScanner.ScanResult(number_of_columns, None)

        @classmethod
        def for_deeply(cls, number_of_columns, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
            """Result of a deep scan; bundles the four sizes into a SizeInfoForPadding."""
            padding_sizes = cls.SizeInfoForPadding(lhs_max_row_number, lhs_max_row_length,
                                                   rhs_max_row_number, rhs_max_row_length)
            return PreScanner.ScanResult(number_of_columns, padding_sizes)

        # NOTE(review): nested inside ScanResult because for_deeply() resolves it as
        # cls.SizeInfoForPadding - confirm against the original indentation.
        class SizeInfoForPadding:
            """Largest row numbers and row lengths seen on each side."""

            def __init__(self, lhs_max_row_number, lhs_max_row_length, rhs_max_row_number, rhs_max_row_length):
                self.lhs_max_row_number = lhs_max_row_number
                self.lhs_max_row_length = lhs_max_row_length
                self.rhs_max_row_number = rhs_max_row_number
                self.rhs_max_row_length = rhs_max_row_length

    def __init__(self):
        pass

    @classmethod
    def scan(cls, context, csv_reader):
        """Run the deep scan when padding sizes are needed, the light scan otherwise."""

        scanner = PreScanner._scan_deeply if context.needs_size_info_for_padding else PreScanner._scan_lightly
        return scanner(csv_reader)

    @classmethod
    def _scan_deeply(cls, csv_reader):
        """
        Notes
        -----
        Purpose of deep pre-scanning
        * Determine the number of columns for value difference detection
        * Get size information to format the horizontal report
        """
        started_at = timeit.default_timer()

        lhs_fact = csv_reader.read_lhs()
        rhs_fact = csv_reader.read_rhs()

        number_of_columns = cls._determine_number_of_columns_from(lhs_fact, rhs_fact)

        end_of_key = MatchingKeyCodec.END_of_KEY

        # Read each side to its end, tracking the longest row as it is rendered by str().
        lhs_max_row_length = 0
        while lhs_fact.lhs_key != end_of_key:
            lhs_max_row_length = max(lhs_max_row_length, len(str(lhs_fact.lhs_row)))
            lhs_fact = csv_reader.read_lhs()

        rhs_max_row_length = 0
        while rhs_fact.rhs_key != end_of_key:
            rhs_max_row_length = max(rhs_max_row_length, len(str(rhs_fact.rhs_row)))
            rhs_fact = csv_reader.read_rhs()

        # Row numbers of the reader state once both sides have been read to the end.
        lhs_max_row_number = csv_reader.lhs_csv_state.row_number
        rhs_max_row_number = csv_reader.rhs_csv_state.row_number
        logger.debug('lhs_max_row_number={}'.format(lhs_max_row_number))
        logger.debug('rhs_max_row_number={}'.format(rhs_max_row_number))

        elapsed = timeit.default_timer() - started_at
        logger.debug("PreScanner#scan() elapsed_time:{0}".format(elapsed) + "[sec]")
        return PreScanner.ScanResult.for_deeply(number_of_columns,
                                                lhs_max_row_number, lhs_max_row_length,
                                                rhs_max_row_number, rhs_max_row_length)

    @classmethod
    def _scan_lightly(cls, csv_reader):
        """
        Notes
        -----
        Purpose of light pre-scanning
        * Determine the number of columns for value difference detection

        Vertical reports do not require size information for formatting.
        """

        first_lhs_fact = csv_reader.read_lhs()
        first_rhs_fact = csv_reader.read_rhs()

        number_of_columns = cls._determine_number_of_columns_from(first_lhs_fact, first_rhs_fact)
        return PreScanner.ScanResult.for_lightly(number_of_columns)

    @classmethod
    def _determine_number_of_columns_from(cls, lhs_fact, rhs_fact):
        """Return the length of the left row, else of the right row, else 0 when both are empty."""

        # The right-hand row is only consulted when the left-hand row is empty or missing.
        return len(lhs_fact.lhs_row or rhs_fact.rhs_row or ())

629

630

class Mark(type):
    """One-character marks used in the reports to classify a row."""

    # Row exists only on the left-hand side / only on the right-hand side.
    LHS_ONLY = '<'
    RHS_ONLY = '>'

    # Row exists on both sides, with / without differing values.
    HAS_DIFF = '!'
    NON_DIFF = ' '
    NON_DIFF_EXPRESSLY = '='

638

639

class HeadingReporter:
    """Prints the report title and, on request, the settings the run was started with."""

    def __init__(self, context):
        self.cxt = context

    def report_heading(self):
        """Print the title, followed by the context when shows_context_from_arguments is set."""

        self._report_title()

        if self.cxt.shows_context_from_arguments:
            self._report_context()

    @classmethod
    @spacing_before(1)
    def _report_title(cls):
        print('============ Report ============')

    @spacing_before(1)
    def _report_context(self):
        """Print every setting of the context, one per line."""

        cxt = self.cxt

        def show(template, value):
            # Prints one setting line; `template` contains a single '{}' placeholder.
            print(template.format(value))

        print('* Context')
        show('File Path on the Left-Hand Side: {}', cxt.lhs_file_path)
        show('File Path on the Right-Hand Side : {}', cxt.rhs_file_path)
        show('Matching Key Indices: {}', cxt.matching_key_codec.matching_key_info_list)
        show('Matching Key Is Unique?: {}', cxt.key_should_be_unique)
        show('Column Indices to Ignore: {}', cxt.column_indices_to_ignore)
        show('with Header?: {}', cxt.first_row_is_header)
        show('Report Style: {}', 'Vertical' if cxt.reports_in_vertical_style else 'Two facing (Horizontal)')
        show('Show Count?: {}', cxt.shows_count)
        show('Show Difference Only?: {}', cxt.shows_difference_only)
        show('Show All?: {}', cxt.shows_all_lines)
        show('Show Context?: {}', cxt.shows_context_from_arguments)
        show('CSV Sniffing Size: {}', cxt.sniffing_size)

        print('--- csv analysis conditions ---')
        show('Forces Individual Specified Conditions?: {}', cxt.forces_individual_specs)
        # DONE: display when the separator is a tab
        show('column_separator_for_lhs: {}', cxt.display_string_for_column_separator(cxt.column_separator_for_lhs))
        # DONE: display when the separator is a tab
        show('column_separator_for_rhs: {}', cxt.display_string_for_column_separator(cxt.column_separator_for_rhs))
        # Line separators are shown hex-encoded.
        show('line_separator_for_lhs: {}', cxt.line_separator_for_lhs.encode('hex'))
        show('line_separator_for_rhs: {}', cxt.line_separator_for_rhs.encode('hex'))
        show('quote_char_for_lhs: {}', cxt.quote_char_for_lhs)
        show('quote_char_for_rhs: {}', cxt.quote_char_for_rhs)
        show('skips_space_after_column_separator_for_lhs: {}', cxt.skips_space_after_column_separator_for_lhs)
        show('skips_space_after_column_separator_for_rhs: {}', cxt.skips_space_after_column_separator_for_rhs)

684

685

class DetailReporter:
    """Abstract base of the reporters that print the per-row detail section."""

    __metaclass__ = abc.ABCMeta

    def __init__(self, context):
        self.cxt = context

    def report_detail_heading(self):
        """Print the heading of the detail section; does nothing unless details are shown."""

        if self.cxt.shows_details:
            self._report_content_heading()
            self._report_file_name()

    @spacing_before(1)
    def _report_content_heading(self):
        """Print '* Differences' or '* All' depending on the display option; otherwise nothing."""

        if self.cxt.shows_difference_only:
            print('* Differences')
            return

        if self.cxt.shows_all_lines:
            print('* All')

    @abc.abstractmethod
    def _report_file_name(self):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        raise NotImplementedError()

    @abc.abstractmethod
    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        raise NotImplementedError()

    class Factory:
        """Chooses the concrete reporter matching the report style of the context."""

        def __init__(self):
            pass

        @staticmethod
        def reporter_for(context, scan_result):
            """Return a VerticalReporter for the vertical style, a HorizontalReporter otherwise."""

            reporter_class = VerticalReporter if context.reports_in_vertical_style else HorizontalReporter
            return reporter_class(context, scan_result)

742

743

class HorizontalReporter(DetailReporter):
    """Detail reporter that prints the left and right rows side by side on one line."""

    class Template:
        """Builds the fixed-width text fragments of the side-by-side layout.

        NOTE(review): this class was rebuilt from a listing in which runs of
        whitespace appear collapsed -- confirm the spacing inside the string
        literals against the original source.
        """

        DIFFERENT_COLUMN_GUIDE = 'Column indices with difference'
        PREFIX_of_DIFF_COLUMNS = ' @ '

        def __init__(self, lhs_max_row_number_length, lhs_max_row_length, rhs_max_row_number_length, rhs_max_row_length):
            """Remember the widths used to pad every printed line.

            Args:
                lhs_max_row_number_length: width reserved for left row numbers.
                lhs_max_row_length: width reserved for the left row text.
                rhs_max_row_number_length: width reserved for right row numbers.
                rhs_max_row_length: width reserved for the right row text.
            """
            self.lhs_max_row_number_length = lhs_max_row_number_length
            self.lhs_filler_length = 1
            self.lhs_max_row_length = lhs_max_row_length
            self.diff_mark_filler_length_in_front = 2
            self.diff_mark_filler_length_in_rear = 2
            self.rhs_max_row_number_length = rhs_max_row_number_length
            self.rhs_filler_length = 1
            self.rhs_max_row_length = rhs_max_row_length
            self.prefix_length_for_diff_columns_displays = len(self.PREFIX_of_DIFF_COLUMNS)

            self.lhs_length = self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length
            # Despite its name this attribute holds the width of the whole mark
            # area: front filler + the one-character mark + rear filler.
            mark_width = 1
            self.diff_mark_length = self.diff_mark_filler_length_in_front + mark_width + self.diff_mark_filler_length_in_rear
            self.rhs_length = self.rhs_max_row_number_length + self.rhs_filler_length + self.rhs_max_row_length

        # --- heading-related description ---

        def division_string(self):
            """Return a dashed rule as wide as a full report line including the column guide."""
            width = (self.lhs_length
                     + self.diff_mark_length
                     + self.rhs_length
                     + self.prefix_length_for_diff_columns_displays
                     + len(self.DIFFERENT_COLUMN_GUIDE))
            return '-' * width

        def file_name_description(self, lhs_file_name, rhs_file_name):
            """Return the heading line: both file names padded to their columns, then the guide text."""
            return (lhs_file_name.ljust(self.lhs_length)
                    + ' ' * self.diff_mark_length
                    + rhs_file_name.ljust(self.rhs_length)
                    + ' ' * self.prefix_length_for_diff_columns_displays
                    + self.DIFFERENT_COLUMN_GUIDE)

        def _diff_mark_area(self, mark):
            """Return *mark* surrounded by the front and rear filler spaces."""
            return (' ' * self.diff_mark_filler_length_in_front) + mark + (' ' * self.diff_mark_filler_length_in_rear)

        # --- left-hand side related description ---

        def lhs_only_description(self, lhs_fact):
            """Return the line for a row found only on the left: left columns plus the LHS-only mark."""
            return self._lhs_description(lhs_fact) + self._diff_mark_area(Mark.LHS_ONLY)

        def _lhs_description(self, lhs_fact):
            """Return the right-justified row number, a filler and the left-justified row text."""
            row_number = str(lhs_fact.lhs_row_number).rjust(self.lhs_max_row_number_length)
            row_text = str(lhs_fact.lhs_row).ljust(self.lhs_max_row_length)
            return row_number + ' ' * self.lhs_filler_length + row_text

        def _lhs_empty_description(self):
            """Return blanks as wide as the left columns (used when only the right row exists)."""
            return ' ' * (self.lhs_max_row_number_length + self.lhs_filler_length + self.lhs_max_row_length)

        # --- right-hand side related description ---

        def rhs_only_description(self, rhs_fact):
            """Return the line for a row found only on the right: blank left columns, mark, right columns."""
            return (self._lhs_empty_description()
                    + self._diff_mark_area(Mark.RHS_ONLY)
                    + self._rhs_description(rhs_fact))

        def _rhs_description(self, rhs_fact):
            """Return the right-justified row number, a filler and the left-justified row text."""
            row_number = str(rhs_fact.rhs_row_number).rjust(self.rhs_max_row_number_length)
            row_text = str(rhs_fact.rhs_row).ljust(self.rhs_max_row_length)
            return row_number + ' ' * self.rhs_filler_length + row_text

        # --- both sides related description ---

        def both_description(self, lhs_fact, rhs_fact, value_difference_result):
            """Return the line for a matched pair.

            The differing column indices are appended, behind the prefix, only
            when the comparison result reports a difference.
            """
            if value_difference_result.has_difference:
                mark = Mark.HAS_DIFF
                trailer = self.PREFIX_of_DIFF_COLUMNS + str(value_difference_result.different_column_indices)
            else:
                mark = Mark.NON_DIFF
                trailer = ''
            return (self._lhs_description(lhs_fact)
                    + self._diff_mark_area(mark)
                    + self._rhs_description(rhs_fact)
                    + trailer)

    def __init__(self, context, scan_result):
        """Build the layout template from the scan result when padding sizes are needed.

        Args:
            context: run settings; also handed to the base reporter.
            scan_result: provides ``size_info_for_padding``; it is only read
                when ``context.needs_size_info_for_padding`` is true.
        """
        super(HorizontalReporter, self).__init__(context)
        self.cxt = context

        if context.needs_size_info_for_padding:
            size_info = scan_result.size_info_for_padding
            # Row numbers are padded to the digit count of the largest number.
            self.template = HorizontalReporter.Template(len(str(size_info.lhs_max_row_number)),
                                                        size_info.lhs_max_row_length,
                                                        len(str(size_info.rhs_max_row_number)),
                                                        size_info.rhs_max_row_length)
        else:
            self.template = None

    # --- report heading related ---

    def _report_file_name(self):
        """Print the file-name heading framed by two dashed rules."""
        print(self.template.division_string())
        print(self.template.file_name_description(os.path.basename(self.cxt.lhs_file_name), os.path.basename(self.cxt.rhs_file_name)))
        print(self.template.division_string())

    # --- report each cases ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        """Print a left-only row when details are enabled."""
        if self.cxt.shows_details:
            print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        """Print a matched pair: always in all-lines mode, only differing pairs in difference-only mode."""
        if (self.cxt.shows_difference_only and value_difference_result.has_difference) or self.cxt.shows_all_lines:
            print(self.template.both_description(lhs_fact, rhs_fact, value_difference_result))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        """Print a right-only row when details are enabled."""
        if self.cxt.shows_details:
            print(self.template.rhs_only_description(rhs_fact))

873

874

class VerticalReporter(DetailReporter):
    """Detail reporter that prints the left and right rows on separate lines."""

    class Template:
        """Text fragments of the vertical layout.

        NOTE(review): this class was rebuilt from a listing in which runs of
        whitespace appear collapsed -- confirm the spacing inside the string
        literals against the original source.
        """

        LHS_MARK = 'L'
        RHS_MARK = 'R'
        PREFIX_of_DIFF_COLUMNS = '@'

        def __init__(self):
            pass

        # --- heading-related description ---

        @classmethod
        def division_string(cls):
            """Return an 80-character dashed rule."""
            return '-' * 80

        @classmethod
        def file_name_description(cls, mark, file_name):
            """Return a heading line made of the side mark and the file name."""
            return mark + ' ' + file_name

        # --- left-hand side related description ---

        @classmethod
        def lhs_only_description(cls, lhs_fact):
            """Return the line for a row found only on the left."""
            return Mark.LHS_ONLY + ' ' + cls.LHS_MARK + ' ' + str(lhs_fact.lhs_row_number) + ' ' + str(lhs_fact.lhs_row)

        # --- right-hand side related description ---

        @classmethod
        def rhs_only_description(cls, rhs_fact):
            """Return the line for a row found only on the right."""
            return Mark.RHS_ONLY + ' ' + cls.RHS_MARK + ' ' + str(rhs_fact.rhs_row_number) + ' ' + str(rhs_fact.rhs_row)

        # --- both sides related description ---

        @classmethod
        def both_description_heading(cls, value_difference_result):
            """Return the line printed above a matched pair.

            With a difference: the diff mark, the prefix and the differing
            column indices. Without: the explicit no-difference mark.
            """
            if value_difference_result.has_difference:
                return Mark.HAS_DIFF + ' ' + cls.PREFIX_of_DIFF_COLUMNS + ' ' + str(value_difference_result.different_column_indices)
            return Mark.NON_DIFF_EXPRESSLY

        @classmethod
        def both_description_lhs(cls, lhs_fact, row_number_length):
            """Return the left line of a matched pair, row number right-justified to *row_number_length*."""
            return ' ' + cls.LHS_MARK + ' ' + str(lhs_fact.lhs_row_number).rjust(row_number_length) + ' ' + str(lhs_fact.lhs_row)

        @classmethod
        def both_description_rhs(cls, rhs_fact, row_number_length):
            """Return the right line of a matched pair, row number right-justified to *row_number_length*."""
            return ' ' + cls.RHS_MARK + ' ' + str(rhs_fact.rhs_row_number).rjust(row_number_length) + ' ' + str(rhs_fact.rhs_row)

    def __init__(self, context, _):
        """Create the reporter.

        Args:
            context: run settings; also handed to the base reporter.
            _: accepted for signature parity with ``HorizontalReporter``; unused.
        """
        super(VerticalReporter, self).__init__(context)
        self.cxt = context
        self.template = VerticalReporter.Template()

    # --- report heading related ---

    def _report_file_name(self):
        """Print one heading line per file, framed by two dashed rules."""
        print(self.template.division_string())
        print(self.template.file_name_description(self.template.LHS_MARK, os.path.basename(self.cxt.lhs_file_name)))
        print(self.template.file_name_description(self.template.RHS_MARK, os.path.basename(self.cxt.rhs_file_name)))
        print(self.template.division_string())

    # --- report each cases ---

    def report_case_of_existed_only_on_lhs(self, lhs_fact):
        """Print a left-only row when details are enabled."""
        if self.cxt.shows_details:
            print(self.template.lhs_only_description(lhs_fact))

    def report_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
        """Print a matched pair as a heading line followed by the left and right rows.

        Printed always in all-lines mode, and only for differing pairs in
        difference-only mode.
        """
        if (self.cxt.shows_difference_only and value_difference_result.has_difference) or self.cxt.shows_all_lines:
            # Align the two row numbers of this pair with each other.
            row_number_length = max(len(str(lhs_fact.lhs_row_number)), len(str(rhs_fact.rhs_row_number)))

            print(self.template.both_description_heading(value_difference_result))
            print(self.template.both_description_lhs(lhs_fact, row_number_length))
            print(self.template.both_description_rhs(rhs_fact, row_number_length))

    def report_case_of_existed_only_on_rhs(self, rhs_fact):
        """Print a right-only row when details are enabled."""
        if self.cxt.shows_details:
            print(self.template.rhs_only_description(rhs_fact))

970

971

class CountReporter:
    """Tallies the comparison outcomes and prints the summary section."""

    class Counter:
        """Running totals and row numbers for each kind of outcome."""

        def __init__(self):
            self.number_of_same_lines = 0
            self.number_of_lhs_only = 0
            self.number_of_rhs_only = 0
            self.number_of_differences = 0

            self.row_numbers_for_lhs_only = []
            self.row_numbers_for_rhs_only = []
            # Maps the left row number to the right row number of a differing pair.
            self.row_numbers_for_differences = {}

            # Memo for ``max_digit``; filled on first access.
            self._max_digit = None

        def _increment_same_lines(self):
            self.number_of_same_lines += 1

        def _increment_lhs_only(self):
            self.number_of_lhs_only += 1

        def _increment_rhs_only(self):
            self.number_of_rhs_only += 1

        def _increment_differences(self):
            self.number_of_differences += 1

        def _add_row_number_for_lhs_only(self, row_number):
            self.row_numbers_for_lhs_only.append(row_number)

        def _add_row_number_for_rhs_only(self, row_number):
            self.row_numbers_for_rhs_only.append(row_number)

        def _add_row_number_for_differences(self, lhs_row_number, rhs_row_number):
            self.row_numbers_for_differences[lhs_row_number] = rhs_row_number

        def count_for_case_of_existed_only_on_lhs(self, row_number):
            """Record a row that exists only on the left."""
            self._increment_lhs_only()
            self._add_row_number_for_lhs_only(row_number)

        def count_for_case_of_existed_on_both_sides(self, lhs_fact, rhs_fact, value_difference_result):
            """Record a matched pair as either a difference (with its row numbers) or a same line."""
            if value_difference_result.has_difference:
                self._increment_differences()
                self._add_row_number_for_differences(lhs_fact.lhs_row_number, rhs_fact.rhs_row_number)
            else:
                self._increment_same_lines()

        def count_for_case_of_existed_only_on_rhs(self, row_number):
            """Record a row that exists only on the right."""
            self._increment_rhs_only()
            self._add_row_number_for_rhs_only(row_number)

        @property
        def sorted_row_numbers_for_differences(self):
            """List of (left row number, right row number) pairs ordered by left row number."""
            return sorted(self.row_numbers_for_differences.items(), key=lambda pair: pair[0])

        @property
        def max_digit(self):
            """Digit count of the largest total.

            Computed on first access and then memoised, so totals changed
            afterwards are not reflected.
            """
            if self._max_digit is None:
                totals = (
                    self.number_of_same_lines,
                    self.number_of_lhs_only,
                    self.number_of_rhs_only,
                    self.number_of_differences,
                )
                self._max_digit = max(len(str(total)) for total in totals)
            return self._max_digit

    def __init__(self, shows_count):
        """Create the reporter.

        Args:
            shows_count: when false, ``report_count`` prints no summary.
        """
        self.shows_count = shows_count
        self.counter = self.Counter()

    def _func_of_right_justified_number(self):
        """Return a function that right-justifies a number to ``counter.max_digit`` characters."""
        return lambda number: str(number).rjust(self.counter.max_digit)

    @spacing_before(1)
    def report_count(self):
        """Print the totals and the affected row numbers, unless counting is disabled.

        NOTE(review): the labels were rebuilt from a listing in which runs of
        whitespace appear collapsed -- confirm their alignment spacing against
        the original source.
        """
        if not self.shows_count:
            return

        print('* Count & Row number')

        rjust = self._func_of_right_justified_number()
        counter = self.counter
        print('same lines : {}'.format(rjust(counter.number_of_same_lines)))
        print('left side only ({}): {} :-- Row Numbers -->: {}'.format(Mark.LHS_ONLY, rjust(counter.number_of_lhs_only), counter.row_numbers_for_lhs_only))
        print('right side only ({}): {} :-- Row Numbers -->: {}'.format(Mark.RHS_ONLY, rjust(counter.number_of_rhs_only), counter.row_numbers_for_rhs_only))
        print('with differences ({}): {} :-- Row Number Pairs -->: {}'.format(Mark.HAS_DIFF, rjust(counter.number_of_differences), counter.sorted_row_numbers_for_differences))

1068

1069

1070# ----------------------------------------------------------------------------------------------------------------------

1071# CSV Reading

1072# ----------------------------------------------------------------------------------------------------------------------

1073

class FileArrangement(type):
    """Attribute-name suffixes that select the left- or right-hand file settings.

    The suffix is appended to a setting name (for example
    ``"column_separator" + FileArrangement.LHS``) to form the attribute looked
    up on the context.
    """

    LHS = '_for_lhs'
    RHS = '_for_rhs'

1078

1079

class CsvDialectFixer:
    """Decides which csv dialect is used to read one of the two input files."""

    def __init__(self):
        pass

    @classmethod
    def fixed_dialect(cls, context, csv_file, file_arrangement):
        """Return ``(dialect, context)`` for *csv_file*.

        Args:
            context: run settings. When ``forces_individual_specs`` is true the
                dialect is built from the settings alone; otherwise the file is
                sniffed first.
            csv_file: open, seekable text file to sniff.
            file_arrangement: attribute-name suffix selecting the per-file
                settings (see ``FileArrangement``).

        Returns:
            A tuple of the dialect and the (possibly adjusted) context.
        """
        if context.forces_individual_specs:
            return cls._dialect_from_context(context, file_arrangement)
        return cls._try_sniffing(context, csv_file, file_arrangement)

    @classmethod
    def _dialect_from_context(cls, context, file_arrangement):
        """Build an excel-based dialect overridden with the per-file settings of *context*."""
        dialect = csv.excel()
        dialect.delimiter = getattr(context, "column_separator" + file_arrangement)
        dialect.lineterminator = getattr(context, "line_separator" + file_arrangement)
        dialect.quotechar = getattr(context, "quote_char" + file_arrangement)
        dialect.skipinitialspace = getattr(context, "skips_space_after_column_separator" + file_arrangement)

        return dialect, context

    @classmethod
    def _try_sniffing(cls, context, csv_file, file_arrangement):
        """Sniff the dialect, falling back to the context settings when sniffing fails.

        The file is rewound in every case so the caller reads from the start.
        """
        try:
            return cls._sniff(context, csv_file, file_arrangement)

        except csv.Error as e:
            # str(e) instead of e.message: the attribute was deprecated in
            # Python 2.6 and does not exist on Python 3 exceptions.
            logger.warning('Sniffing failed. Generated a dialect from context instead. [type={}, args={}, message={}]'.format(type(e), str(e.args), e))
            return cls._dialect_from_context(context, file_arrangement)

        finally:
            csv_file.seek(0)

    @classmethod
    def _sniff(cls, context, csv_file, file_arrangement):
        """Sniff dialect and header presence from the first ``context.sniffing_size`` characters.

        Raises:
            csv.Error: when the sniffer cannot determine the dialect.
        """
        sample = csv_file.read(context.sniffing_size)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)

        adjusted_context = cls._adjust_context_with(dialect, has_header, context, file_arrangement)

        return dialect, adjusted_context

    @classmethod
    def _adjust_context_with(cls, dialect, has_header, context, file_arrangement):
        """Write the sniffed values back onto *context* (mutated in place) and return it."""
        setattr(context, "column_separator" + file_arrangement, dialect.delimiter)
        setattr(context, "line_separator" + file_arrangement, dialect.lineterminator)
        setattr(context, "quote_char" + file_arrangement, dialect.quotechar)
        setattr(context, "skips_space_after_column_separator" + file_arrangement, dialect.skipinitialspace)
        # An explicit header setting ('y' means yes, anything else no) wins
        # over the sniffer's guess.
        if context.header is None:
            context.first_row_is_header = has_header
        else:
            context.first_row_is_header = context.header == 'y'

        return context

1141

1142

def debug_log_dialect(dialect, context, message):
    """Write every attribute of *dialect* to the debug log.

    Args:
        dialect: csv dialect (class or instance) to describe.
        context: unused; kept so existing callers keep working.
        message: label written as the first log line.
    """
    import binascii

    def hex_of(text):
        # Delimiter and line terminator are often invisible characters, so
        # they are logged as hex. hexlify replaces str.encode('hex'), which
        # exists on Python 2 only.
        raw = text if isinstance(text, bytes) else text.encode('utf-8')
        return binascii.hexlify(raw).decode('ascii')

    logger.debug('---{}---'.format(message))
    logger.debug('sniffing dialect={}'.format(dialect))
    logger.debug('sniffing dialect csv.excel={}'.format(isinstance(dialect, csv.excel)))
    logger.debug('sniffing dialect csv.excel_tab={}'.format(isinstance(dialect, csv.excel_tab)))
    logger.debug('sniffing dialect.delimiter={}'.format(hex_of(dialect.delimiter)))
    logger.debug('sniffing dialect.doublequote={}'.format(dialect.doublequote))
    logger.debug('sniffing dialect.escapechar={}'.format(dialect.escapechar))
    logger.debug('sniffing dialect.lineterminator={}'.format(hex_of(dialect.lineterminator)))
    logger.debug('sniffing dialect.quotechar={}'.format(dialect.quotechar))
    logger.debug('sniffing dialect.quoting={}'.format(dialect.quoting))
    logger.debug('sniffing dialect.skipinitialspace={}'.format(dialect.skipinitialspace))

1156

1157

class LhsFact:
    """One row read from the left-hand file, with its row number and matching key."""

    def __init__(self, lhs_row_number, lhs_row, lhs_key):
        """Store the row data and write it to the debug log.

        Args:
            lhs_row_number: row number of the row in the left-hand file.
            lhs_row: the row as returned by the csv reader.
            lhs_key: matching key derived from the row.
        """
        logger.debug('LhsFact 生成 lhs_row_number={}, lhs_row={}, lhs_key={}'.format(lhs_row_number, lhs_row, lhs_key))

        self.lhs_row_number = lhs_row_number
        self.lhs_row = lhs_row
        self.lhs_key = lhs_key

1167

1168

class RhsFact:
    """One row read from the right-hand file, with its row number and matching key."""

    def __init__(self, rhs_row_number, rhs_row, rhs_key):
        """Store the row data and write it to the debug log.

        Args:
            rhs_row_number: row number of the row in the right-hand file.
            rhs_row: the row as returned by the csv reader.
            rhs_key: matching key derived from the row.
        """
        logger.debug('RhsFact 生成 rhs_row_number={}, rhs_row={}, rhs_key={}'.format(rhs_row_number, rhs_row, rhs_key))

        self.rhs_row_number = rhs_row_number
        self.rhs_row = rhs_row
        self.rhs_key = rhs_key

class CsvReader:
    """Reads the left and right CSV files row by row and checks their key order."""

    class State:
        """Reading position of one CSV file: reader, row counter and last seen key."""

        def __init__(self, csv_file, dialect, file_name, first_row_is_header):
            self._csv_file = csv_file
            self._dialect = dialect
            self._file_name = file_name
            self._first_row_is_header = first_row_is_header

            self._csv_reader = csv.reader(csv_file, dialect)
            self._row_number = 0
            self._previous_key = ""

        def reset(self):
            """Rewind the file and start over with a fresh reader, counter and key."""
            self._csv_file.seek(0)
            self._csv_reader = csv.reader(self._csv_file, self._dialect)
            self._row_number = 0
            self._previous_key = ""

        def increment_row_number(self):
            """Advance the row counter, unless the end-of-file key has been recorded."""
            if self._previous_key == MatchingKeyCodec.END_of_KEY:
                return

            self._row_number += 1

        def key_changed(self, new_key):
            """Remember *new_key* as the last seen key; a header row is ignored."""
            if self._is_header():
                return

            self._previous_key = new_key

        def _is_header(self):
            # True only while no row has been counted yet and the file is
            # declared to start with a header.
            return self.row_number == 0 and self._first_row_is_header

        @property
        def csv_reader(self):
            return self._csv_reader

        @property
        def file_name(self):
            return self._file_name

        @property
        def row_number(self):
            return self._row_number

        @property
        def previous_key(self):
            return self._previous_key

    def __init__(self, lhs_csv, rhs_csv, lhs_dialect, rhs_dialect, context):
        """Set up both reading states and skip the header rows if there are any.

        Args:
            lhs_csv: open, seekable left-hand CSV file.
            rhs_csv: open, seekable right-hand CSV file.
            lhs_dialect: csv dialect for the left-hand file.
            rhs_dialect: csv dialect for the right-hand file.
            context: run settings (file names, header flag, key codec,
                uniqueness rule).
        """
        debug_log_dialect(lhs_dialect, context, '左CSV')
        debug_log_dialect(rhs_dialect, context, '右CSV')

        self.lhs_csv_state = CsvReader.State(lhs_csv, lhs_dialect, context.lhs_file_name, context.first_row_is_header)
        self.rhs_csv_state = CsvReader.State(rhs_csv, rhs_dialect, context.rhs_file_name, context.first_row_is_header)
        self.cxt = context

        self.skip_header()

    def skip_header(self):
        """Consume the first row of both files when it is a header."""
        if self.cxt.first_row_is_header:
            self.read_lhs()
            self.read_rhs()

    def reset(self):
        """Rewind both files and skip their headers again."""
        self.lhs_csv_state.reset()
        self.rhs_csv_state.reset()
        self.skip_header()

    def read_lhs(self):
        """Read the next left-hand row and return it as an ``LhsFact``."""
        lhs_row, lhs_key = self._read_csv(self.lhs_csv_state)
        self.lhs_csv_state.increment_row_number()
        return LhsFact(self.lhs_csv_state.row_number, lhs_row, lhs_key)

    def read_rhs(self):
        """Read the next right-hand row and return it as an ``RhsFact``."""
        rhs_row, rhs_key = self._read_csv(self.rhs_csv_state)
        self.rhs_csv_state.increment_row_number()
        return RhsFact(self.rhs_csv_state.row_number, rhs_row, rhs_key)

    def _read_csv(self, csv_state):
        """Return ``(row, key)`` for the next row of *csv_state*.

        At end of file the result is an empty row paired with
        ``MatchingKeyCodec.END_of_KEY``.
        """
        try:
            row = next(csv_state.csv_reader)
        except StopIteration:
            csv_state.key_changed(MatchingKeyCodec.END_of_KEY)
            return [], MatchingKeyCodec.END_of_KEY

        new_key = self.cxt.matching_key_codec.managed_key_for(row)
        # Validate against the previous key before it is overwritten.
        self._detect_key_violation(new_key, csv_state)

        csv_state.key_changed(new_key)

        return row, new_key

    def _detect_key_violation(self, new_key, csv_state):
        """Log an error and exit with status 1 when keys are unsorted or not unique.

        Uniqueness is enforced only when ``context.key_should_be_unique`` is
        set. Nothing is checked until a key has been recorded.

        Raises:
            SystemExit: on either violation.
        """
        if csv_state.previous_key == '':
            return

        if new_key < csv_state.previous_key:
            logger.error('matching keys in {} are not sorted.'
                         ' [current_key={}, previous_key={}, matching-key-indices={}] If the key is a number without zero padding, specify the max size of the key after colon like -k0:8.'.format(
                             csv_state.file_name, MatchingKeyCodec.decode_key(new_key), MatchingKeyCodec.decode_key(csv_state.previous_key), self.cxt.matching_key_codec.matching_key_info_list))
            # sys.exit rather than the builtin exit(): the latter is injected
            # by the site module and is not guaranteed to exist.
            sys.exit(1)

        if self.cxt.key_should_be_unique and new_key == csv_state.previous_key:
            logger.error('matching keys in {} are not unique.'
                         ' [current_key={}, previous_key={}, matching-key-indices={}]'.format(
                             csv_state.file_name, MatchingKeyCodec.decode_key(new_key), MatchingKeyCodec.decode_key(csv_state.previous_key), self.cxt.matching_key_codec.matching_key_info_list))
            sys.exit(1)

1301 exit(1)

1302

1303

# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()

1307

Coverage for src/csvdiff2/csvdiff.py : 96%

712 statements

Coverage for src/csvdiff2/csvdiff.py : 96%

712 statements 685 run 27 missing 0 excluded

712 statements