Coverage for sparc/docparser.py: 83%

369 statements  

« prev     ^ index     » next       coverage.py v7.6.9, created at 2024-12-12 01:13 +0000

1# -*- coding: utf-8 -*- 

2""" 

3A module to parse the latex documents provided by SPARC 

4and convert to its Python API 

5 

6Created on Wed Mar 1 15:32:31 EST 2023 

7 

8Tian Tian (alchem0x2a@gmail.com) 

9""" 

10import json 

11import re 

12from copy import copy 

13from datetime import datetime 

14from pathlib import Path 

15from warnings import warn 

16 

17import numpy as np 

18 

# Some fields in master SPARC doc may cause auto type detection
# to fail, need hard-coded post-processing for now.
# Keys are parameter symbols; each value is a dict of fields that
# SparcDocParser.postprocess() merges into the parsed entry of that symbol.
postprocess_items = {
    "RELAX_FLAG": {"allow_bool_input": False},
    "NPT_SCALE_CONSTRAINTS": {"type": "string"},
    "NPT_SCALE_VECS": {"type": "integer array"},
    "TOL_POISSON": {"type": "double"},
}

# Default repository URL used by SparcDocParser.json_from_repo() and by the
# command-line entry point when --git is given without an explicit root.
sparc_repo_url = "https://github.com/SPARC-X/SPARC.git"

29 

30 

31class SparcDocParser(object): 

32 """Parses LaTeX documentation of SPARC-X and converts it into a Python API. 

33 

34 This class extracts parameter information from LaTeX source files, 

35 organizing it into a structured format that can be easily used in 

36 Python. It supports parsing of version details, parameter types, 

37 units, and other relevant information. 

38 

39 Attributes: 

40 version (str): Parsed SPARC version, based on the documentation. 

41 parameter_categories (list): Categories of parameters extracted. 

42 parameters (dict): Extracted parameters with detailed information. 

43 other_parameters (dict): Additional parameters not categorized. 

44 

45 Methods: 

46 find_main_file(main_file_pattern): Finds the main LaTeX file based on a pattern. 

47 get_include_files(): Retrieves a list of included LaTeX files. 

48 parse_version(parse): Parses and sets the SPARC version. 

49 parse_parameters(): Extracts parameters from LaTeX files. 

50 postprocess(): Applies hard-coded post-processing to some parameters. 

51 to_dict(): Converts parsed information into a dictionary. 

52 json_from_directory(directory, include_subdirs, **kwargs): Class method to create JSON from a directory. 

53 json_from_repo(url, version, include_subdirs, **kwargs): Class method to create JSON from a repository. 

54 

55 """ 

56 

57 def __init__( 

58 self, 

59 directory=".", 

60 main_file="*Manual.tex", 

61 intro_file="Introduction.tex", 

62 params_from_intro=True, 

63 parse_version=True, 

64 ): 

65 """Create the doc parser pointing to the root of the doc file of SPARC 

66 

67 The SPARC doc is organized as follows: 

68 SPARC/doc/.LaTeX/ 

69 |---- Manual.tex 

70 |---- Introduction.tex 

71 |---- {Section}.tex 

72 

73 For parameters additional to the standard SPARC options, such as the SQ / cyclix 

74 options, we merge the dict from the sub-dirs 

75 

76 Args: 

77 doc_root: root directory to the LaTeX files, may look like `SPARC/doc/.LaTeX` 

78 main_file: main LaTeX file for the manual 

79 intro_file: LaTeX file for the introduction 

80 params_from_intro: only contain the parameters that can be parsed in `intro_file` 

81 parse_date: get the SPARC version by date 

82 """ 

83 self.root = Path(directory) 

84 self.main_file = self.find_main_file(main_file) 

85 self.intro_file = self.root / intro_file 

86 if not self.intro_file.is_file(): 

87 raise FileNotFoundError(f"Introduction file {intro_file} is missing!") 

88 self.include_files = self.get_include_files() 

89 self.params_from_intro = params_from_intro 

90 self.parse_version(parse_version) 

91 self.parse_parameters() 

92 self.postprocess() 

93 

94 def find_main_file(self, main_file_pattern): 

95 """ 

96 Finds the main LaTeX file that matches the given pattern, e.g. Manual.tex or Manual_cyclix.te 

97 

98 Args: 

99 main_file_pattern (str): Pattern to match the main LaTeX file name. 

100 

101 Returns: 

102 Path: Path to the main LaTeX file. 

103 

104 Raises: 

105 FileNotFoundError: If no or multiple files match the pattern. 

106 """ 

107 candidates = list(self.root.glob(main_file_pattern)) 

108 if len(candidates) != 1: 

109 raise FileNotFoundError( 

110 f"Main file {main_file_pattern} is missing or more than 1 exists!" 

111 ) 

112 return candidates[0] 

113 

114 def get_include_files(self): 

115 """ 

116 Retrieves a list of LaTeX files included in the main LaTeX document, e.g. Manual.tex. 

117 

118 Returns: 

119 list: A list of paths to the included LaTeX files. 

120 """ 

121 pattern = r"\\begin\{document\}(.*?)\\end\{document\}" 

122 text = open(self.main_file, "r", encoding="utf8").read() 

123 # Only the first begin/end document will be matched 

124 match = re.findall(pattern, text, re.DOTALL)[0] 

125 pattern_include = r"\\include\{(.+?)\}" 

126 include = re.findall(pattern_include, match, re.DOTALL) 

127 include_files = [] 

128 for name in include: 

129 tex_file = self.root / f"{name}.tex" 

130 if tex_file.is_file(): 

131 include_files.append(tex_file) 

132 else: 

133 warn( 

134 ( 

135 f"TeX file {tex_file} is missing! It may be a typo in the document, " 

136 "ignore parameters from this file." 

137 ) 

138 ) 

139 return include_files 

140 

141 def parse_version(self, parse=True): 

142 """ 

143 Parses and sets the SPARC version based on the C-source file, if possible. 

144 The date for the SPARC code is parsed from initialization.c in the "YYYY.MM.DD" 

145 format. 

146 

147 Args: 

148 parse (bool): Whether to parse the version from the documentation. 

149 

150 Sets: 

151 self.version (str): The parsed version in 'YYYY.MM.DD' format or None, 

152 if either parse=False, or the C-source code is missing 

153 """ 

154 if parse is False: 

155 self.version = None 

156 return 

157 init_c = self.root.parents[1] / "src" / "initialization.c" 

158 if not init_c.is_file(): 

159 warn( 

160 'Cannot find the c source file "initialization.c", skip version parsing!' 

161 ) 

162 self.version = None 

163 return 

164 text = open(init_c, "r", encoding="utf8").read() 

165 pattern_version = r"SPARC\s+\(\s*?version(.*?)\)" 

166 match = re.findall(pattern_version, text) 

167 if len(match) != 1: 

168 warn( 

169 'Parsing c source file "initialization.c" for version is unsuccessful!' 

170 ) 

171 self.version = None 

172 return 

173 # We need to add more spacing matching in case the source code includes extra 

174 date_str = re.sub(r"\s+", " ", match[0].strip().replace(",", " ")) 

175 # Older version of SPARC doc may contain abbreviated month format 

176 date_version = None 

177 for fmt in ("%b %d %Y", "%B %d %Y"): 

178 try: 

179 date_version = datetime.strptime(date_str, fmt).strftime("%Y.%m.%d") 

180 break 

181 except Exception: 

182 continue 

183 if date_version is None: 

184 raise ValueError(f"Cannot parse date time {date_str}") 

185 self.version = date_version 

186 return 

187 

188 def __parse_parameter_from_frame(self, frame): 

189 """Parse the parameters from a single LaTeX frame 

190 

191 Args: 

192 frame (str): a string containing the LaTeX frame (e.g. \\begin{frame} ... \\end{frame}) 

193 

194 Returns: 

195 dict: a key-value paired dict parsed from the frame. Some field names include: 

196 name: TOL_POISSON 

197 type: Double | Integer | String | Character | Double array 

198 unit: specified in the doc 

199 """ 

200 pattern_label = r"\\texttt\{(.*?)\}.*?\\label\{(.*?)\}" 

201 pattern_block = r"\\begin\{block\}\{(.*?)\}([\s\S]*?)\\end\{block\}" 

202 match_label = re.findall(pattern_label, frame, re.DOTALL | re.MULTILINE) 

203 if len(match_label) != 1: 

204 warn("Provided a non-structured frame for parsing, skip.") 

205 return {} 

206 symbol, label = ( 

207 convert_tex_parameter(match_label[0][0].strip()), 

208 match_label[0][1].strip(), 

209 ) 

210 # Every match contains the (name, content) pair of the blocks 

211 matches = re.findall(pattern_block, frame, re.DOTALL | re.MULTILINE) 

212 param_dict = {"symbol": symbol, "label": label} 

213 # TODO: add more type definition 

214 for key, content in matches: 

215 key = key.lower() 

216 content = content.strip() 

217 # Do not parse commented-out values 

218 

219 if (key == "type") and (content.startswith("%")): 

220 warn(f"Parameter {symbol} is disabled in the doc, ignore!") 

221 return {} 

222 if key in ("example",): 

223 content = convert_tex_example(content) 

224 param_dict[key] = content 

225 # Sanitize 1: Convert types 

226 param_dict = sanitize_type(param_dict) 

227 # Sanitize 2: Convert default values 

228 param_dict = sanitize_default(param_dict) 

229 # Sanitize 3: Remove TeX components in description and remark 

230 param_dict = sanitize_description(param_dict) 

231 

232 return param_dict 

233 

234 def __parse_frames_from_text(self, text): 

235 """Extract all the frames that aren't commented in the text 

236 

237 Arguments: 

238 text (str): Full LaTeX text 

239 Returns: 

240 list: Matched LaTeX Beamer frame fragments 

241 """ 

242 pattern_frame = r"\\begin\{frame\}(.*?)\\end\{frame\}" 

243 matches = re.findall(pattern_frame, text, re.DOTALL | re.MULTILINE) 

244 return matches 

245 

246 def __parse_intro_file(self): 

247 """Parse the introduction file 

248 

249 Returns: 

250 parameter_dict (dict): dictionary using the parameter category as the main key 

251 (following order in Introduction.tex) 

252 parameter_categories (list): list of categories 

253 """ 

254 text_intro = open(self.intro_file, "r", encoding="utf8").read() 

255 pattern_params = ( 

256 r"^\\begin\{frame\}.*?\{Input file options\}.*?$(.*?)\\end\{frame\}" 

257 ) 

258 pattern_block = r"\\begin\{block\}\{(.*?)\}([\s\S]*?)\\end\{block\}" 

259 pattern_line = r"\\hyperlink\{(.*?)\}{\\texttt\{(.*?)\}\}" 

260 text_params = re.findall(pattern_params, text_intro, re.DOTALL | re.MULTILINE)[ 

261 0 

262 ] 

263 parameter_categories = [] 

264 parameter_dict = {} 

265 for match in re.findall(pattern_block, text_params): 

266 cat = match[0].lower() 

267 # print(cat) 

268 if cat in parameter_categories: 

269 raise ValueError( 

270 f"Key {cat} already exists! You might have a wrong LaTeX doc file!" 

271 ) 

272 parameter_categories.append(cat) 

273 parameter_dict[cat] = [] 

274 param_lines = match[1].split("\n") 

275 for line in param_lines: 

276 matches = re.findall(pattern_line, line) 

277 if len(matches) == 0: 

278 continue 

279 # Each match should contain 2 items, the "Link" that matches a reference in included-tex files 

280 # symbol is the actual symbol name (in text-format) 

281 # In most cases the link and symbol should be the same 

282 for match in matches: 

283 label, symbol = match[0].strip(), convert_tex_parameter( 

284 match[1].strip() 

285 ) 

286 parameter_dict[cat].append({"label": label, "symbol": symbol}) 

287 return parameter_categories, parameter_dict 

288 

289 def __parse_all_included_files(self): 

290 """Pop up all known parameters from included files 

291 Returns: 

292 dict: All known parameters from included files 

293 """ 

294 all_params = {} 

295 for f in self.include_files: 

296 # Do not parse intro file since it's waste of time 

297 if f.resolve() == self.intro_file.resolve(): 

298 continue 

299 text = open(f, "r", encoding="utf8").read() 

300 frames = self.__parse_frames_from_text(text) 

301 for frame in frames: 

302 dic = self.__parse_parameter_from_frame(frame) 

303 if len(dic) > 0: 

304 label = dic["label"] 

305 all_params[label] = dic 

306 return all_params 

307 

308 def parse_parameters(self): 

309 """The actual thing for parsing parameters 

310 

311 Sets: 

312 parameters (dict): All parsed parameters 

313 parameter_categoris (list): List of categories 

314 other_parameters (dict): Any parameters that are not included in the categories 

315 """ 

316 parameter_categories, parameter_dict = self.__parse_intro_file() 

317 all_params = self.__parse_all_included_files() 

318 self.parameter_categories = parameter_categories 

319 # parameters contain only the "valid" ones that are shown in the intro 

320 # all others are clustered in "other_parameters" 

321 self.parameters = {} 

322 for cat, params in parameter_dict.items(): 

323 for p in params: 

324 label = p["label"] 

325 symbol = p["symbol"] 

326 param_details = all_params.pop(label, {}) 

327 if param_details != {}: 

328 param_details["category"] = cat 

329 self.parameters[symbol] = param_details 

330 

331 self.other_parameters = {} 

332 for param_details in all_params.values(): 

333 symbol = param_details["symbol"] 

334 self.other_parameters[symbol] = param_details 

335 return 

336 

337 def postprocess(self): 

338 """Use the hardcoded dict prostprocess_items to fix some issues""" 

339 for param, fix in postprocess_items.items(): 

340 if param in self.parameters: 

341 self.parameters[param].update(**fix) 

342 return 

343 

344 def to_dict(self): 

345 """Output a json dict from current document parser 

346 

347 Returns: 

348 dict: All API schemes in dict 

349 """ 

350 doc = {} 

351 doc["sparc_version"] = self.version 

352 doc["categories"] = self.parameter_categories 

353 doc["parameters"] = {k: v for k, v in sorted(self.parameters.items())} 

354 doc["other_parameters"] = { 

355 k: v for k, v in sorted(self.other_parameters.items()) 

356 } 

357 doc["data_types"] = sorted(set([p["type"] for p in self.parameters.values()])) 

358 return doc 

359 

360 @classmethod 

361 def json_from_directory(cls, directory=".", include_subdirs=True, **kwargs): 

362 """ 

363 Recursively add parameters from all Manual files 

364 Arguments: 

365 directory (str or PosixPath): The directory to the LaTeX files, e.g. <sparc-root>/doc/.LaTeX 

366 include_subdirs (bool): If true, also parse the manual files in submodules, e.g. cyclix, highT 

367 Returns: 

368 str: Formatted json-string of the API 

369 """ 

370 directory = Path(directory) 

371 root_dict = cls(directory=directory, **kwargs).to_dict() 

372 if include_subdirs: 

373 for sub_manual_tex in directory.glob("*/*Manual.tex"): 

374 subdir = sub_manual_tex.parent 

375 try: 

376 sub_dict = cls(directory=subdir, parse_version=False).to_dict() 

377 except FileNotFoundError: 

378 print( 

379 subdir, 

380 " Latex files not found. Check naming conventions for Manual.tex. Expects format *Manual.tex", 

381 ) 

382 continue 

383 for param, param_desc in sub_dict["parameters"].items(): 

384 if param not in root_dict["parameters"]: 

385 root_dict["parameters"][param] = param_desc 

386 # Combine the subdir categories 

387 for sub_category in sub_dict["categories"]: 

388 if sub_category not in root_dict["categories"]: 

389 root_dict["categories"].append(sub_category) 

390 # Combine data types 

391 for sub_dt in sub_dict["data_types"]: 

392 if sub_dt not in root_dict["data_types"]: 

393 root_dict["data_types"].append(sub_dt) 

394 

395 json_string = json.dumps(root_dict, indent=True) 

396 return json_string 

397 

398 @classmethod 

399 def json_from_repo( 

400 cls, url=sparc_repo_url, version="master", include_subdirs=True, **kwargs 

401 ): 

402 """ 

403 Download the source code from git and use json_from_directory to parse 

404 Arguments: 

405 url (str): URL for the repository of SPARC, default is "https://github.com/SPARC-X/SPARC.git" 

406 version (str): Git version or commit hash of the SPARC repo 

407 include_subdirs (bool): If true, also parse the manual files in submodules, e.g. cyclix, highT 

408 Returns: 

409 str: Formatted json-string of the API 

410 """ 

411 import tempfile 

412 from subprocess import run 

413 

414 with tempfile.TemporaryDirectory() as tmpdir: 

415 tmpdir = Path(tmpdir) 

416 download_dir = tmpdir / "SPARC" 

417 download_cmds = ["git", "clone", "--depth", "1", str(url), "SPARC"] 

418 run(download_cmds, cwd=tmpdir) 

419 if version not in ["master", "HEAD"]: 

420 fetch_cmds = ["git", "fetch", "--depth", "1", str(version)] 

421 run(fetch_cmds, cwd=download_dir) 

422 checkout_cmds = ["git", "checkout", str(version)] 

423 run(checkout_cmds, cwd=download_dir) 

424 json_string = cls.json_from_directory( 

425 directory=download_dir / "doc" / ".LaTeX", 

426 include_subdirs=include_subdirs, 

427 **kwargs, 

428 ) 

429 return json_string 

430 

431 

def convert_tex_parameter(text):
    """
    Convert a TeX string to non-escaped name (for parameter only)

    Surrounding whitespace is stripped and every TeX-escaped underscore
    (backslash followed by underscore) becomes a plain underscore.

    Arguments:
        text (str): Parameter name in LaTeX format
    Returns:
        str: Text with sanitized parameter
    """
    # "\\_" spells backslash + underscore explicitly; the bare "\_" form is an
    # invalid escape sequence that newer Python versions warn about.
    return text.strip().replace("\\_", "_")

441 

442 

def convert_tex_example(text):
    """Convert TeX codes of examples as much as possible
    The examples follow the format
    SYMBOL: values (may contain new lines)

    Only the first colon separates the symbol from its values, so values that
    themselves contain a colon are preserved.

    Arguments:
        text (str): Single or multiline LaTeX contents
    Returns:
        str: Sanitized literal text
    Raises:
        ValueError: If the text contains no colon at all.
    """
    # Applied in order: the lone-backslash rule must come last because the
    # earlier rules still need to see the backslashes of \texttt and \_.
    replacements = (
        ("\\texttt{", ""),
        ("\\_", "_"),
        ("}", ""),
        ("\\", "\n"),
    )
    new_text = text
    for old, new in replacements:
        new_text = new_text.replace(old, new)

    symbol, values = new_text.split(":", 1)
    symbol = symbol.strip()
    # A TeX line break (two backslashes) became two newlines above; collapse them
    values = re.sub("\n+", "\n", values.strip())
    # Remove all comment lines
    values = "\n".join(
        line for line in values.splitlines() if not line.lstrip().startswith("%")
    )
    return f"{symbol}: {values}"

466 

467 

def convert_tex_default(text, desired_type=None):
    """Convert default values as much as possible.
    The desire type will convert the default values
    to the closest format

    Currently supported conversions
    1. Remove all surrounding text modifiers (texttt)
    2. Remove all symbol wrappers $
    3. Convert value to single or array

    Arguments:
        text (str): Raw text string for value
        desired_type (str or None): Data type to be converted to. If None, preserve the string format

    Returns:
        converted: Value converted from raw text. None when the text mentions
                   "none" or "no default", "auto" when it mentions "automat",
                   the cleaned string for `desired_type` None or "string",
                   otherwise the result of `text2value`.
    """
    # Ordered literal replacements; an ordered tuple avoids the duplicate
    # dictionary key that the two spellings of backslash-underscore produced.
    replacements = (
        ("\\texttt{", ""),
        ("}", ""),
        ("{", ""),
        ("\\_", "_"),
        ("\\\\", "\n"),
        ("$", ""),
    )
    text = text.strip()
    text = re.sub(r"\\hyperlink\{.*?\}", "", text)
    text = re.sub(r"\\times", "x", text)
    for old, new in replacements:
        text = text.replace(old, new)
    text = re.sub(r"\n+", "\n", text)
    # Remove all comment lines
    text = "\n".join(
        line for line in text.splitlines() if not line.lstrip().startswith("%")
    )

    lowered = text.lower()
    if "none" in lowered or "no default" in lowered:
        return None
    if "automat" in lowered:
        return "auto"
    # No numeric conversion requested: keep the cleaned text
    if desired_type is None or desired_type == "string":
        return text
    return text2value(text, desired_type)

520 

521 

def convert_comment(text):
    """Used to remove TeX-specific commands in description and remarks
    as much as possible

    Hyperlink and href targets are dropped, braces and dollar signs are
    removed, escaped underscores are unescaped, TeX line breaks become
    newlines and lines starting with % are discarded.

    Arguments:
        text (str): Raw LaTeX code for the comment section in manual

    Returns:
        str: Sanitized plain text
    """
    # Ordered literal replacements; an ordered tuple avoids the duplicate
    # dictionary key that the two spellings of backslash-underscore produced.
    replacements = (
        ("\\texttt{", ""),
        ("}", ""),
        ("{", ""),
        ("\\_", "_"),
        ("\\\\", "\n"),
        ("$", ""),
    )
    text = text.strip()
    text = re.sub(r"\\hyperlink\{.*?\}", "", text)
    text = re.sub(r"\\href\{.*?\}", "", text)
    text = re.sub(r"\\times", "x", text)
    for old, new in replacements:
        text = text.replace(old, new)
    text = re.sub(r"\n+", "\n", text)
    # Remove all comment lines
    return "\n".join(
        line for line in text.splitlines() if not line.lstrip().startswith("%")
    )

551 

552 

def text2value(text, desired_type):
    """Turn raw text into a Python value of the requested type.

    The text is first read as a space-delimited float array (one row per
    line); the array is then cast according to `desired_type`.

    Arguments:
        text (str): Text contents for the value
        desired_type (str): Target data type from 'string', 'integer',
                            'integer array', 'double', 'double array',
                            'bool', 'bool array' (case-insensitive)
    Returns:
        converted: Value converted to the desired type. The text itself when
                   `desired_type` is None, the stripped text for 'string', and
                   None whenever the numeric parsing or the cast fails.
    """
    from contextlib import suppress

    if desired_type is None:
        return text
    kind = desired_type.lower()
    if kind == "string":
        return text.strip()

    try:
        values = np.genfromtxt(text.splitlines(), delimiter=" ", dtype=float)
    except Exception as e:
        warn(
            f"Cannot transform {text} to array, skip converting. Error message is:\n {e}"
        )
        return None
    if np.isnan(values).any():
        warn(
            f"Some fields in {text} cannot converted to a numerical array, will skip conversion."
        )
        return None

    # A single number parses to a 0-d array; promote it to 1-D
    if values.shape == ():
        values = np.reshape(values, [1])

    scalar_casts = {"integer": int, "bool": bool, "double": float}
    array_casts = {"integer array": int, "bool array": bool, "double array": float}

    result = None
    # Any failure during the cast leaves the result as None
    with suppress(Exception):
        if kind in scalar_casts:
            result = scalar_casts[kind](values[0])
        elif kind in array_casts:
            result = values.astype(array_casts[kind]).tolist()
    return result

608 

609 

def is_array(text):
    """Tell whether a whitespace-separated numeric string holds more than one value.

    The string is parsed with numpy and the number of parsed values is
    compared against 1. It is only used to compare float / int values.

    Arguments:
        text (str): Whitespace-separated numbers

    Returns:
        bool: True only when more than one number was parsed. A string that
              yields no number at all (e.g. an empty string) is not an array.
    """
    val = np.fromstring(text, sep=" ")
    # "> 1" rather than "!= 1": zero parsed values must not count as an array
    return len(val) > 1

619 

620 

def contain_only_bool(text):
    """Check that every integer token found in the string is 0 or 1.

    Any string containing '.', '+', '-', 'e' or 'E' is rejected outright.
    A string without any digit token passes the check.
    """
    for marker in (".", "+", "-", "e", "E"):
        if marker in text:
            return False
    tokens = re.findall(r"[-+e\d]+", text, re.DOTALL)
    return all(int(token) in (0, 1) for token in tokens)

631 

632 

def sanitize_description(param_dict):
    """Clean up the description and remark fields of a parameter entry.

    The untouched texts are kept under `description_raw` and `remark_raw`;
    `description` and `remark` are replaced by their `convert_comment` output.
    A missing remark is treated as an empty string.

    Arguments:
        param_dict (dict): Raw dict for one parameter entry

    Returns:
        dict: A copy of the entry with raw and human-readable description
              and remark fields
    """
    cleaned = dict(param_dict)
    raw_description = cleaned["description"]
    raw_remark = cleaned.get("remark", "")

    cleaned["description_raw"] = raw_description
    cleaned["remark_raw"] = raw_remark
    cleaned["description"] = convert_comment(raw_description)
    cleaned["remark"] = convert_comment(raw_remark)
    return cleaned

654 

655 

def sanitize_default(param_dict):
    """Clean up the default field of a parameter entry.

    1. The original default text is preserved under `default_remark`
    2. `default` is replaced by the output of `convert_tex_default`,
       using the entry's `type` as the desired type

    This function should be called after sanitize_type

    Arguments:
        param_dict (dict): Parameter entry containing `default` and `type`

    Returns:
        dict: A copy of the entry with `default` converted
    """
    cleaned = dict(param_dict)
    raw_default = cleaned["default"]
    cleaned["default_remark"] = raw_default
    cleaned["default"] = convert_tex_default(raw_default, param_dict["type"])
    return cleaned

669 

670 

def sanitize_type(param_dict):
    """Sanitize the param dict so that the type are more consistent

    For example, if type is Double / Integer,
    but parameter is a vector,
    make a double vector or integer vector

    Arguments:
        param_dict (dict): Parameter entry; must contain `symbol`. The
            `example` and `default` fields are consulted for scalar types.

    Returns:
        dict: A copy of the entry with a normalized lowercase `type` and an
              `allow_bool_input` flag. The entry is returned unchanged when
              it has no `type` at all.
    """
    sanitized_dict = param_dict.copy()
    symbol = param_dict["symbol"]
    origin_type = param_dict.get("type", None)
    if origin_type is None:
        print("Dict does not have type!")
        return sanitized_dict
    origin_type = origin_type.lower()

    sanitized_type = None
    sanitized_dict["allow_bool_input"] = False
    # First pass, remove all singular types
    if origin_type == "0 or 1":
        origin_type = "integer"
    elif "permutation" in origin_type:
        sanitized_type = "integer"
    elif origin_type in ("string", "character"):
        sanitized_type = "string"
    elif "array" in origin_type:
        sanitized_type = origin_type

    # Pass 2, test if int values are arrays
    if (origin_type in ("int", "integer", "double")) and (sanitized_type is None):
        if "int" in origin_type:
            origin_type = "integer"
        # Pre-bind everything the except branch and the code below read, so a
        # failure before assignment (e.g. a missing "example" key) cannot
        # surface as an UnboundLocalError.
        example_value = None
        _array_test = False
        _bool_test = False
        # Test if the value from example is a single value or array
        try:
            example_value = param_dict["example"].split(":")[1]
            default = param_dict["default"]
            _array_test = is_array(example_value)
            _bool_test = contain_only_bool(example_value) and contain_only_bool(default)
        except Exception as e:
            warn(
                f"Array conversion failed for {example_value}, ignore."
                f"The error is {e}"
            )
            # Fall back to a plain scalar without boolean input
            _array_test = False
            _bool_test = False

        if _array_test is True:
            sanitized_type = f"{origin_type} array"
        else:
            sanitized_type = origin_type

        # Pass 3: int to boolean test. This should be done very tight
        if _bool_test and ("integer" in sanitized_type):
            sanitized_dict["allow_bool_input"] = True

    if sanitized_type is None:
        # Currently there is only one NPT_NH_QMASS has this type
        # TODO: think of a way to format a mixed array?
        warn(f"Type of {symbol} if not standard digit or array, mark as others.")
        sanitized_type = "other"
    # TODO: how about provide a true / false type?
    sanitized_dict["type"] = sanitized_type
    return sanitized_dict

732 

733 

if __name__ == "__main__":
    # Run the module as independent script to extract a json-formatted parameter list
    from argparse import ArgumentParser

    argp = ArgumentParser(description="Parse the LaTeX doc to json")
    argp.add_argument(
        "-o",
        "--output",
        default="parameters.json",
        help="Output file name (json-formatted)",
    )
    argp.add_argument(
        "--include-subdirs",
        action="store_true",
        help="Parse manual parameters from subdirs",
    )
    argp.add_argument("--git", action="store_true")
    argp.add_argument(
        "--version",
        default="master",
        help="Version of the doc. Only works when using git repo",
    )
    argp.add_argument(
        "root",
        nargs="?",
        help=(
            "Root of the SPARC doc LaTeX files, or remote git repo link. If not provided and --git is enables, use the default github repo"
        ),
    )

    args = argp.parse_args()
    # The output always carries a .json suffix, whatever the user typed
    output = Path(args.output).with_suffix(".json")
    if args.git:
        # Without an explicit root, fall back to the default SPARC repository
        root = sparc_repo_url if args.root is None else args.root
        json_string = SparcDocParser.json_from_repo(
            url=root, version=args.version, include_subdirs=args.include_subdirs
        )
    else:
        # The positional is optional for --git only; a local parse needs a
        # directory, so exit with a usage message instead of a TypeError
        # from Path(None).
        if args.root is None:
            argp.error("root is required unless --git is given")
        json_string = SparcDocParser.json_from_directory(
            directory=Path(args.root), include_subdirs=args.include_subdirs
        )
    with open(output, "w", encoding="utf8") as fd:
        fd.write(json_string)
    print(f"SPARC parameter specifications written to {output}!")
    print("If you need to fintune the definitions, please edit them manually.")