Coverage for sparc/docparser.py: 83%

380 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2025-06-18 16:19 +0000

1# -*- coding: utf-8 -*- 

2""" 

3A module to parse the latex documents provided by SPARC 

4and convert to its Python API 

5 

6Created on Wed Mar 1 15:32:31 EST 2023 

7 

8Tian Tian (alchem0x2a@gmail.com) 

9""" 

10import json 

11import re 

12from copy import copy 

13from datetime import datetime 

14from pathlib import Path 

15from warnings import warn 

16 

17import numpy as np 

18 

# Automatic type detection gives wrong results for a few fields in the
# master SPARC documentation. The overrides below are merged into the parsed
# parameter entries by ``SparcDocParser.postprocess``.
postprocess_items = {
    "RELAX_FLAG": dict(allow_bool_input=False),
    "NPT_SCALE_CONSTRAINTS": dict(type="string"),
    "NPT_SCALE_VECS": dict(type="integer array"),
    "TOL_POISSON": dict(type="double"),
}

# Default upstream repository used by ``SparcDocParser.json_from_repo``
sparc_repo_url = "https://github.com/SPARC-X/SPARC.git"

29 

30 

class SparcDocParser(object):
    """Parse the LaTeX documentation of SPARC-X and convert it into a Python API.

    The parser reads the main manual file, follows its ``\\include`` entries,
    extracts one parameter description per Beamer frame and groups the
    parameters by the categories listed in the introduction file.

    Attributes:
        version (str or None): SPARC version in ``YYYY.MM.DD`` format, or None
            when version parsing is disabled or unsuccessful.
        parameter_categories (list): Categories found in the introduction file.
        parameters (dict): Parameters that are referenced from the introduction
            file, keyed by symbol.
        other_parameters (dict): Parsed parameters that are not referenced from
            the introduction file, keyed by symbol.
        suppress_warnings (bool): Whether UserWarnings emitted by the parser
            are silenced (they may be annoying during class import).

    Methods:
        find_main_file(main_file_pattern): Find the main LaTeX file.
        get_include_files(): List the LaTeX files included by the main file.
        parse_version(parse): Parse and set the SPARC version.
        parse_parameters(): Extract parameters from the LaTeX files.
        postprocess(): Apply the hard-coded fixes in ``postprocess_items``.
        to_dict(): Convert the parsed information into a dictionary.
        json_from_directory(directory, include_subdirs, **kwargs): Build the
            JSON string from a local directory.
        json_from_repo(url, version, include_subdirs, **kwargs): Build the
            JSON string from a git repository.
    """

    def __init__(
        self,
        directory=".",
        main_file="*Manual.tex",
        intro_file="Introduction.tex",
        params_from_intro=True,
        parse_version=True,
        suppress_warnings=True,
    ):
        """Create the doc parser pointing to the root of the SPARC doc files.

        The SPARC doc is organized as follows:
        SPARC/doc/.LaTeX/
            |---- Manual.tex
                  |---- Introduction.tex
                  |---- {Section}.tex

        For parameters additional to the standard SPARC options, such as the
        SQ / cyclix options, the dicts parsed from the sub-directories are
        merged (see ``json_from_directory``).

        Args:
            directory: root directory of the LaTeX files, may look like
                `SPARC/doc/.LaTeX`
            main_file: glob pattern of the main LaTeX file for the manual
            intro_file: name of the LaTeX file for the introduction
            params_from_intro: stored on the instance as given; the methods of
                this class do not read it
            parse_version: whether to parse the SPARC version (by date) from
                the C source code
            suppress_warnings: whether to silence any warnings generated by
                the parser

        Raises:
            FileNotFoundError: If the main file or the introduction file
                cannot be found.
        """
        self.suppress_warnings = suppress_warnings
        self.params_from_intro = params_from_intro
        self.root = Path(directory)
        self.intro_file = self.root / intro_file
        self.main_file = self.find_main_file(main_file)
        if not self.intro_file.is_file():
            raise FileNotFoundError(f"Introduction file {intro_file} is missing!")
        self.include_files = self.get_include_files()
        self.parse_version(parse_version)
        self.parse_parameters()
        self.postprocess()

    @staticmethod
    def _read_text(path):
        """Return the UTF-8 decoded content of `path`, closing the file afterwards."""
        with open(path, "r", encoding="utf8") as fd:
            return fd.read()

    def find_main_file(self, main_file_pattern):
        """
        Finds the main LaTeX file that matches the given pattern,
        e.g. Manual.tex or Manual_cyclix.tex

        Args:
            main_file_pattern (str): Glob pattern to match the main LaTeX file name.

        Returns:
            Path: Path to the main LaTeX file.

        Raises:
            FileNotFoundError: If no or multiple files match the pattern.
        """
        candidates = list(self.root.glob(main_file_pattern))
        if len(candidates) != 1:
            raise FileNotFoundError(
                f"Main file {main_file_pattern} is missing or more than 1 exists!"
            )
        return candidates[0]

    def get_include_files(self):
        """
        Retrieves a list of LaTeX files included in the main LaTeX document,
        e.g. Manual.tex. Included files that do not exist on disk are skipped
        (with a warning unless warnings are suppressed).

        Returns:
            list: A list of paths to the included LaTeX files.

        Raises:
            IndexError: If the main file has no document environment.
        """
        pattern = r"\\begin\{document\}(.*?)\\end\{document\}"
        text = self._read_text(self.main_file)
        # Only the first begin/end document pair is used
        document_body = re.findall(pattern, text, re.DOTALL)[0]
        pattern_include = r"\\include\{(.+?)\}"
        include_files = []
        for name in re.findall(pattern_include, document_body, re.DOTALL):
            tex_file = self.root / f"{name}.tex"
            if tex_file.is_file():
                include_files.append(tex_file)
            elif not self.suppress_warnings:
                warn(
                    (
                        f"TeX file {tex_file} is missing! It may be a typo in the document, "
                        "ignore parameters from this file."
                    )
                )
        return include_files

    def parse_version(self, parse=True):
        """
        Parses and sets the SPARC version based on the C-source file, if possible.
        The date for the SPARC code is parsed from initialization.c, which is
        looked up at `<root>/../../src/initialization.c`, and stored in the
        "YYYY.MM.DD" format.

        Args:
            parse (bool): Whether to parse the version. Only the literal
                ``False`` disables parsing.

        Sets:
            self.version (str): The parsed version in 'YYYY.MM.DD' format or None,
                                if either parse=False, the C-source code is missing,
                                or the version string is not found exactly once

        Raises:
            ValueError: If a version string is found but is not a known date format.
        """
        if parse is False:
            self.version = None
            return
        init_c = self.root.parents[1] / "src" / "initialization.c"
        if not init_c.is_file():
            if not self.suppress_warnings:
                warn(
                    'Cannot find the c source file "initialization.c", skip version parsing!'
                )
            self.version = None
            return
        text = self._read_text(init_c)
        pattern_version = r"SPARC\s+\(\s*?version(.*?)\)"
        match = re.findall(pattern_version, text)
        if len(match) != 1:
            if not self.suppress_warnings:
                warn(
                    'Parsing c source file "initialization.c" for version is unsuccessful!'
                )
            self.version = None
            return
        # Normalize commas and repeated whitespace in case the source code
        # includes extra separators around the date
        date_str = re.sub(r"\s+", " ", match[0].strip().replace(",", " "))
        # Older version of SPARC doc may contain abbreviated month format
        date_version = None
        for fmt in ("%b %d %Y", "%B %d %Y"):
            try:
                date_version = datetime.strptime(date_str, fmt).strftime("%Y.%m.%d")
                break
            except ValueError:
                # strptime signals a format mismatch with ValueError
                continue
        if date_version is None:
            raise ValueError(f"Cannot parse date time {date_str}")
        self.version = date_version
        return

    def __parse_parameter_from_frame(self, frame):
        """Parse the parameters from a single LaTeX frame

        Args:
            frame (str): a string containing the LaTeX frame
                         (e.g. \\begin{frame} ... \\end{frame})

        Returns:
            dict: a key-value paired dict parsed from the frame, or an empty
                  dict when the frame is not a parameter frame or the
                  parameter is commented out. Some field names include:
                    name: TOL_POISSON
                    type: Double | Integer | String | Character | Double array
                    unit: specified in the doc
        """
        pattern_label = r"\\texttt\{(.*?)\}.*?\\label\{(.*?)\}"
        pattern_block = r"\\begin\{block\}\{(.*?)\}([\s\S]*?)\\end\{block\}"
        match_label = re.findall(pattern_label, frame, re.DOTALL | re.MULTILINE)
        if len(match_label) != 1:
            if not self.suppress_warnings:
                warn("Provided a non-structured frame for parsing, skip.")
            return {}
        symbol, label = (
            convert_tex_parameter(match_label[0][0].strip()),
            match_label[0][1].strip(),
        )
        # Every match contains the (name, content) pair of the blocks
        matches = re.findall(pattern_block, frame, re.DOTALL | re.MULTILINE)
        param_dict = {"symbol": symbol, "label": label}
        # TODO: add more type definition
        for key, content in matches:
            key = key.lower()
            content = content.strip()
            # Do not parse commented-out values
            if (key == "type") and (content.startswith("%")):
                if not self.suppress_warnings:
                    warn(f"Parameter {symbol} is disabled in the doc, ignore!")
                return {}
            if key in ("example",):
                content = convert_tex_example(content)
            param_dict[key] = content
        # Sanitize 1: Convert types
        param_dict = sanitize_type(param_dict, suppress_warnings=self.suppress_warnings)
        # Sanitize 2: Convert default values
        param_dict = sanitize_default(
            param_dict, suppress_warnings=self.suppress_warnings
        )
        # Sanitize 3: Remove TeX components in description and remark
        param_dict = sanitize_description(param_dict)

        return param_dict

    def __parse_frames_from_text(self, text):
        """Extract the content of every frame environment in the text

        Arguments:
            text (str): Full LaTeX text
        Returns:
            list: Matched LaTeX Beamer frame fragments
        """
        pattern_frame = r"\\begin\{frame\}(.*?)\\end\{frame\}"
        return re.findall(pattern_frame, text, re.DOTALL | re.MULTILINE)

    def __parse_intro_file(self):
        """Parse the introduction file

        Returns:
            parameter_categories (list): list of categories
            parameter_dict (dict): dictionary using the parameter category as the main key
                                   (following order in Introduction.tex)

        Raises:
            IndexError: If the "Input file options" frame is not found.
            ValueError: If a category appears more than once.
        """
        text_intro = self._read_text(self.intro_file)
        pattern_params = (
            r"^\\begin\{frame\}.*?\{Input file options\}.*?$(.*?)\\end\{frame\}"
        )
        pattern_block = r"\\begin\{block\}\{(.*?)\}([\s\S]*?)\\end\{block\}"
        pattern_line = r"\\hyperlink\{(.*?)\}{\\texttt\{(.*?)\}\}"
        text_params = re.findall(pattern_params, text_intro, re.DOTALL | re.MULTILINE)[
            0
        ]
        parameter_categories = []
        parameter_dict = {}
        for block_title, block_body in re.findall(pattern_block, text_params):
            cat = block_title.lower()
            if cat in parameter_categories:
                raise ValueError(
                    f"Key {cat} already exists! You might have a wrong LaTeX doc file!"
                )
            parameter_categories.append(cat)
            parameter_dict[cat] = []
            for line in block_body.split("\n"):
                # Each match contains 2 items: the "link" that matches a
                # reference in the included tex files, and the actual symbol
                # name (in text format). In most cases they are the same.
                for link, tex_symbol in re.findall(pattern_line, line):
                    label = link.strip()
                    symbol = convert_tex_parameter(tex_symbol.strip())
                    parameter_dict[cat].append({"label": label, "symbol": symbol})
        return parameter_categories, parameter_dict

    def __parse_all_included_files(self):
        """Collect all known parameters from included files

        Returns:
            dict: All known parameters from included files, keyed by label
        """
        all_params = {}
        intro_resolved = self.intro_file.resolve()
        for f in self.include_files:
            # Do not parse intro file since it's waste of time
            if f.resolve() == intro_resolved:
                continue
            text = self._read_text(f)
            for frame in self.__parse_frames_from_text(text):
                dic = self.__parse_parameter_from_frame(frame)
                if len(dic) > 0:
                    all_params[dic["label"]] = dic
        return all_params

    def parse_parameters(self):
        """The actual thing for parsing parameters

        Sets:
            parameters (dict): Parsed parameters that are listed in the intro file
            parameter_categories (list): List of categories
            other_parameters (dict): Any parameters that are not included in the categories
        """
        parameter_categories, parameter_dict = self.__parse_intro_file()
        all_params = self.__parse_all_included_files()
        self.parameter_categories = parameter_categories
        # parameters contain only the "valid" ones that are shown in the intro
        # all others are clustered in "other_parameters"
        self.parameters = {}
        for cat, params in parameter_dict.items():
            for p in params:
                # pop so that whatever remains afterwards is "other"
                param_details = all_params.pop(p["label"], {})
                if param_details:
                    param_details["category"] = cat
                    self.parameters[p["symbol"]] = param_details

        self.other_parameters = {}
        for param_details in all_params.values():
            self.other_parameters[param_details["symbol"]] = param_details
        return

    def postprocess(self):
        """Use the hardcoded dict postprocess_items to fix some issues"""
        for param, fix in postprocess_items.items():
            if param in self.parameters:
                self.parameters[param].update(fix)
        return

    def to_dict(self):
        """Output a json dict from current document parser

        Returns:
            dict: All API schemes in dict. The "categories" list is a copy, so
                  callers may extend it without changing the parser state.
        """
        doc = {}
        doc["sparc_version"] = self.version
        # Copy: json_from_directory appends to this list when merging sub-manuals
        doc["categories"] = list(self.parameter_categories)
        doc["parameters"] = dict(sorted(self.parameters.items()))
        doc["other_parameters"] = dict(sorted(self.other_parameters.items()))
        doc["data_types"] = sorted({p["type"] for p in self.parameters.values()})
        return doc

    @classmethod
    def json_from_directory(cls, directory=".", include_subdirs=True, **kwargs):
        """
        Recursively add parameters from all Manual files
        Arguments:
            directory (str or PosixPath): The directory to the LaTeX files, e.g. <sparc-root>/doc/.LaTeX
            include_subdirs (bool): If true, also parse the manual files in submodules, e.g. cyclix, highT
            **kwargs: Extra keyword arguments for the parser of `directory`. Only
                      `suppress_warnings` is also applied to the sub-directory parsers.
        Returns:
            str: Formatted json-string of the API
        """
        directory = Path(directory)
        root_dict = cls(directory=directory, **kwargs).to_dict()
        if include_subdirs:
            # Sub-manuals never parse the version; honour the caller's warning
            # preference instead of silently dropping it
            sub_kwargs = {"parse_version": False}
            if "suppress_warnings" in kwargs:
                sub_kwargs["suppress_warnings"] = kwargs["suppress_warnings"]
            for sub_manual_tex in directory.glob("*/*Manual.tex"):
                subdir = sub_manual_tex.parent
                try:
                    sub_dict = cls(directory=subdir, **sub_kwargs).to_dict()
                except FileNotFoundError:
                    print(
                        subdir,
                        " Latex files not found. Check naming conventions for Manual.tex. Expects format *Manual.tex",
                    )
                    continue
                # Parameters of the root manual take precedence
                for param, param_desc in sub_dict["parameters"].items():
                    if param not in root_dict["parameters"]:
                        root_dict["parameters"][param] = param_desc
                # Combine the subdir categories
                for sub_category in sub_dict["categories"]:
                    if sub_category not in root_dict["categories"]:
                        root_dict["categories"].append(sub_category)
                # Combine data types
                for sub_dt in sub_dict["data_types"]:
                    if sub_dt not in root_dict["data_types"]:
                        root_dict["data_types"].append(sub_dt)

        json_string = json.dumps(root_dict, indent=True)
        return json_string

    @classmethod
    def json_from_repo(
        cls, url=sparc_repo_url, version="master", include_subdirs=True, **kwargs
    ):
        """
        Download the source code from git and use json_from_directory to parse
        Arguments:
            url (str): URL for the repository of SPARC, default is "https://github.com/SPARC-X/SPARC.git"
            version (str): Git version or commit hash of the SPARC repo
            include_subdirs (bool): If true, also parse the manual files in submodules, e.g. cyclix, highT
        Returns:
            str: Formatted json-string of the API
        Raises:
            subprocess.CalledProcessError: If any of the git commands fails.
        """
        import tempfile
        from subprocess import run

        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            download_dir = tmpdir / "SPARC"
            download_cmds = ["git", "clone", "--depth", "1", str(url), "SPARC"]
            # check=True: a failed git command must not be silently ignored,
            # otherwise a different revision than requested would be parsed
            run(download_cmds, cwd=tmpdir, check=True)
            if version not in ["master", "HEAD"]:
                # The first positional argument of `git fetch` is the remote,
                # so the remote must be named for the version to be a refspec
                fetch_cmds = ["git", "fetch", "--depth", "1", "origin", str(version)]
                run(fetch_cmds, cwd=download_dir, check=True)
                # FETCH_HEAD is what was just fetched, whether the version is
                # a branch, a tag or a commit hash
                checkout_cmds = ["git", "checkout", "FETCH_HEAD"]
                run(checkout_cmds, cwd=download_dir, check=True)
            json_string = cls.json_from_directory(
                directory=download_dir / "doc" / ".LaTeX",
                include_subdirs=include_subdirs,
                **kwargs,
            )
        return json_string

440 

441 

def convert_tex_parameter(text):
    """
    Convert a TeX string to non-escaped name (for parameter only)

    Surrounding whitespace is removed and every TeX-escaped underscore
    (backslash followed by underscore) becomes a plain underscore.

    Arguments:
        text (str): Parameter name in LaTeX format
    Returns:
        str: Text with sanitized parameter
    """
    # "\\_" is the two-character sequence backslash + underscore; spelling it
    # with an explicit escape avoids the invalid-escape-sequence warning
    return text.strip().replace("\\_", "_")

451 

452 

def convert_tex_example(text):
    """Convert TeX codes of examples as much as possible
    The examples follow the format
    SYMBOL: values (may contain new lines)

    Arguments:
        text (str): Single or multiline LaTeX contents
    Returns:
        str: Sanitized literal text in the form "SYMBOL: values"
    Raises:
        ValueError: If the text does not contain a ":" separator
    """
    # Order matters: the specific TeX tokens must be replaced before the
    # remaining single backslashes are turned into line breaks
    replacements = (
        ("\\texttt{", ""),
        ("\\_", "_"),
        ("}", ""),
        ("\\", "\n"),
    )
    new_text = text
    for old, new in replacements:
        new_text = new_text.replace(old, new)
    symbol, values = new_text.split(":", maxsplit=1)
    symbol = symbol.strip()
    # A TeX line break (two backslashes) became two newlines above
    values = re.sub("\n+", "\n", values.strip())
    # Remove all comment lines
    values = "\n".join(
        line for line in values.splitlines() if not line.lstrip().startswith("%")
    )
    return f"{symbol}: {values}"

475 

476 

def convert_tex_default(text, desired_type=None, suppress_warnings=False):
    """Convert default values as much as possible.
    The desired type will convert the default values
    to the closest format

    Currently supported conversions
    1. Remove all surrounding text modifiers (texttt)
    2. Remove all symbol wrappers $
    3. Convert value to single or array

    Arguments:
        text (str): Raw text string for value
        desired_type (str or None): Data type to be converted to. If None, preserve the string format
        suppress_warnings (bool): Passed on to `text2value` for the numerical conversion

    Returns:
        converted: Value converted from raw text. None when the text mentions
                   "none" or "no default", "auto" when it mentions "automat"
    """
    # Applied in order; "\\_" is backslash + underscore and "\\\\" is the
    # TeX line break (two backslashes)
    replacements = (
        ("\\texttt{", ""),
        ("}", ""),
        ("{", ""),
        ("\\_", "_"),
        ("\\\\", "\n"),
        ("$", ""),
    )
    text = text.strip()
    text = re.sub(r"\\hyperlink\{.*?\}", "", text)
    text = re.sub(r"\\times", "x", text)
    for old, new in replacements:
        text = text.replace(old, new)
    text = re.sub(r"\n+", "\n", text)
    # Remove all comment lines
    text = "\n".join(
        line for line in text.splitlines() if not line.lstrip().startswith("%")
    )

    lowered = text.lower()
    if ("none" in lowered) or ("no default" in lowered):
        return None
    if "automat" in lowered:
        return "auto"
    # Strings (or unknown target type) are kept as cleaned text
    if (desired_type is None) or (desired_type == "string"):
        return text
    # try type conversion
    return text2value(text, desired_type, suppress_warnings=suppress_warnings)

531 

532 

def convert_comment(text):
    """Used to remove TeX-specific commands in description and remarks
    as much as possible

    Arguments:
        text (str): Raw LaTeX code for the comment section in manual

    Returns:
        str: Sanitized plain text
    """
    # Applied in order; "\\_" is backslash + underscore and "\\\\" is the
    # TeX line break (two backslashes)
    replacements = (
        ("\\texttt{", ""),
        ("}", ""),
        ("{", ""),
        ("\\_", "_"),
        ("\\\\", "\n"),
        ("$", ""),
    )
    text = text.strip()
    # Drop the link target (first brace group); the link text is kept
    text = re.sub(r"\\hyperlink\{.*?\}", "", text)
    text = re.sub(r"\\href\{.*?\}", "", text)
    text = re.sub(r"\\times", "x", text)
    for old, new in replacements:
        text = text.replace(old, new)
    text = re.sub(r"\n+", "\n", text)
    # Remove all comment lines
    text = "\n".join(
        line for line in text.splitlines() if not line.lstrip().startswith("%")
    )
    return text

562 

563 

def text2value(text, desired_type, suppress_warnings=False):
    """Convert raw text to a desired type

    The text is first parsed as a space-delimited numerical table; the result
    is then cast to the requested scalar or array type. Any failure along the
    way yields None.

    Arguments:
        text (str): Text contents for the value
        desired_type (str): Target data type from 'string', 'integer',
                            'integer array', 'double', 'double array',
                            'bool', 'bool array'
        suppress_warnings (bool): Suppress UserWarning if overwhelming for end-users
    Returns:
        converted: Value converted to the desired type
    """
    if desired_type is None:
        return text
    target = desired_type.lower()
    if target == "string":
        return text.strip()

    try:
        values = np.genfromtxt(text.splitlines(), delimiter=" ", dtype=float)
        # Fields that cannot be read as float show up as nan
        if np.isnan(values).any():
            if not suppress_warnings:
                warn(
                    f"Some fields in {text} cannot converted to a numerical array, will skip conversion."
                )
            values = None
    except Exception as e:
        if not suppress_warnings:
            warn(
                f"Cannot transform {text} to array, skip converting. Error message is:\n {e}"
            )
        values = None

    if values is None:
        return None

    # A single number is parsed as a 0-d array, promote it to 1-D
    if values.ndim == 0:
        values = values.reshape(1)

    scalar_casts = {"integer": int, "bool": bool, "double": float}
    array_casts = {"integer array": int, "bool array": bool, "double array": float}
    try:
        if target in scalar_casts:
            return scalar_casts[target](values[0])
        if target in array_casts:
            return values.astype(array_casts[target]).tolist()
    except Exception:
        # A failed cast deliberately results in None
        return None
    # Unknown target types are not converted
    return None

622 

623 

def is_array(text):
    """Test if a string holds more than one numerical value

    The text is parsed as whitespace-separated numbers and counts as an array
    only when more than one value is found. It is only used to compare a
    float / int value.

    Arguments:
        text (str): Text contents for the value
    Returns:
        bool: True if more than one number is parsed. A single value, and
              text from which no number is parsed, are not arrays
    """
    values = np.fromstring(text, sep=" ")
    # Zero parsed values (e.g. an empty example) must not count as an array
    return len(values) > 1

633 

634 

def contain_only_bool(text):
    """Check that every number in a string is either 0 or 1

    Text containing a decimal point, a sign or an exponent character is
    rejected right away. Text without any digits passes the check.
    """
    for forbidden in (".", "+", "-", "e", "E"):
        if forbidden in text:
            return False
    tokens = re.findall(r"[-+e\d]+", text, re.DOTALL)
    return all(int(token) in (0, 1) for token in tokens)

645 

646 

def sanitize_description(param_dict):
    """Clean up the description and remark of one parameter entry

    The original texts are kept under `description_raw` and `remark_raw`.
    A missing remark is treated as an empty string, a missing description
    raises KeyError.

    Arguments:
        param_dict (dict): Raw dict for one parameter entry

    Returns:
        dict: Copy of the parameter dict in which description and remark are
              converted to human-readable formats
    """
    cleaned = param_dict.copy()
    raw_description = cleaned["description"]
    raw_remark = cleaned.get("remark", "")
    cleaned.update(
        description_raw=raw_description,
        remark_raw=raw_remark,
        description=convert_comment(raw_description),
        remark=convert_comment(raw_remark),
    )
    return cleaned

668 

669 

def sanitize_default(param_dict, suppress_warnings=False):
    """Clean up the default value of one parameter entry

    1. The original default is preserved in the extra field `default_remark`
    2. `convert_tex_default` converts the value as much as possible, using
       the `type` field of the entry

    This function should be called after sanitize_type

    Arguments:
        param_dict (dict): Dict for one parameter entry
        suppress_warnings (bool): Passed on to `convert_tex_default`

    Returns:
        dict: Copy of the parameter dict with the converted default
    """
    result = param_dict.copy()
    raw_default = result["default"]
    result["default_remark"] = raw_default
    result["default"] = convert_tex_default(
        raw_default, param_dict["type"], suppress_warnings=suppress_warnings
    )
    return result

685 

686 

def sanitize_type(param_dict, suppress_warnings=False):
    """Sanitize the param dict so that the type are more consistent

    For example, if type is Double / Integer,
    but parameter is a vector,
    make a double vector or integer vector

    Arguments:
        param_dict (dict): Raw dict for one parameter entry, must contain `symbol`
        suppress_warnings (bool): Suppress UserWarning if overwhelming for end-users

    Returns:
        dict: Copy of the parameter dict with a normalized `type` and the
              extra field `allow_bool_input`. If the entry has no `type`,
              an unchanged copy is returned
    """
    sanitized_dict = param_dict.copy()
    symbol = param_dict["symbol"]
    origin_type = param_dict.get("type", None)
    if origin_type is None:
        print("Dict does not have type!")
        return sanitized_dict
    origin_type = origin_type.lower()

    sanitized_type = None
    sanitized_dict["allow_bool_input"] = False
    # First pass, remove all singular types
    if origin_type == "0 or 1":
        # Handled as an integer in the second pass
        origin_type = "integer"
    elif "permutation" in origin_type:
        sanitized_type = "integer"
    elif origin_type in ("string", "character"):
        sanitized_type = "string"
    elif "array" in origin_type:
        sanitized_type = origin_type

    # Pass 2, test if int values are arrays
    if (origin_type in ("int", "integer", "double")) and (sanitized_type is None):
        if "int" in origin_type:
            origin_type = "integer"
        # Test if the value from example is a single value or array.
        # Bound before the try block, since the warning below reports it even
        # when the example field itself is missing
        example_value = None
        try:
            example_value = param_dict["example"].split(":")[1]
            default = param_dict["default"]
            _array_test = is_array(example_value)
            _bool_test = contain_only_bool(example_value) and contain_only_bool(default)
        except Exception as e:
            if not suppress_warnings:
                warn(
                    f"Array conversion failed for {example_value}, ignore."
                    f"The error is {e}"
                )
            _array_test = False  # Retain
            _bool_test = False

        if _array_test is True:
            sanitized_type = f"{origin_type} array"
        else:
            sanitized_type = origin_type

        # Pass 3: int to boolean test. This should be done very tight
        if _bool_test and ("integer" in sanitized_type):
            sanitized_dict["allow_bool_input"] = True

    if sanitized_type is None:
        # Currently there is only one NPT_NH_QMASS has this type
        # TODO: think of a way to format a mixed array?
        if not suppress_warnings:
            warn(f"Type of {symbol} if not standard digit or array, mark as others.")
        sanitized_type = "other"
    # TODO: how about provide a true / false type?
    sanitized_dict["type"] = sanitized_type
    return sanitized_dict

751 

752 

if __name__ == "__main__":
    # Run the module as independent script to extract a json-formatted parameter list
    from argparse import ArgumentParser

    argp = ArgumentParser(description="Parse the LaTeX doc to json")
    argp.add_argument(
        "-o",
        "--output",
        default="parameters.json",
        help="Output file name (json-formatted)",
    )
    argp.add_argument(
        "--include-subdirs",
        action="store_true",
        help="Parse manual parameters from subdirs",
    )
    argp.add_argument("--git", action="store_true")
    argp.add_argument(
        "--version",
        default="master",
        help="Version of the doc. Only works when using git repo",
    )
    argp.add_argument(
        "root",
        nargs="?",
        help=(
            "Root of the SPARC doc LaTeX files, or remote git repo link. If not provided and --git is enables, use the default github repo"
        ),
    )

    args = argp.parse_args()
    # The output always carries the .json suffix, whatever the user typed
    output = Path(args.output).with_suffix(".json")
    if args.git:
        # Without an explicit link, fall back to the default repository
        root = sparc_repo_url if args.root is None else args.root
        json_string = SparcDocParser.json_from_repo(
            url=root, version=args.version, include_subdirs=args.include_subdirs
        )
    else:
        if args.root is None:
            # A local run needs the doc directory; report a usage error
            # instead of failing later on Path(None)
            argp.error("root is required unless --git is given")
        json_string = SparcDocParser.json_from_directory(
            directory=Path(args.root), include_subdirs=args.include_subdirs
        )
    with open(output, "w", encoding="utf8") as fd:
        fd.write(json_string)
    print(f"SPARC parameter specifications written to {output}!")
    print("If you need to finetune the definitions, please edit them manually.")