| import json |
| from pathlib import Path |
| from typing import Iterator, Dict |
|
|
| |
| |
| |
# Input chunk files, merged in exactly this order. The names are relative
# paths, so they are resolved against the current working directory.
files = [
    "pool_multiple_choice_chunk_01.json",
    "pool_multiple_choice_chunk_02.json",
    "pool_multiple_choice_chunk_03.json",
    "pool_multiple_choice_chunk_04.json",
    "pool_numerical_chunk_01.json",
    "pool_numerical_chunk_02.json",
    "pool_numerical_chunk_03.json",
    "pool_regression_chunk_01.json",
]

# Destination of the merged output (a single JSON array).
out_path = Path("merged_train.json")
|
|
| |
| |
| |
def iter_records(path: Path) -> Iterator[Dict]:
    """Yield the JSON records stored in ``path``.

    The whole file is read into memory and may be:

    - a single JSON array (each element is yielded in order),
    - a single JSON object (yielded as the only record), or
    - JSONL, one JSON value per line (blank lines are skipped).

    JSONL parsing is only attempted when the text as a whole is not one
    valid JSON document.

    Args:
        path: File to read. Decoded as UTF-8; a leading byte-order mark is
            ignored if present.

    Yields:
        Each record, in file order.

    Raises:
        ValueError: If the file is a valid JSON document whose top-level
            value is neither an array nor an object, or if a non-blank
            line of a JSONL file is not valid JSON.
    """
    # "utf-8-sig" drops a leading BOM; json.loads refuses text that still
    # starts with one, which would otherwise surface as a bogus line-1 error.
    text = path.read_text(encoding="utf-8-sig")

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        pass  # Not a single JSON document; parse it as JSONL below.
    else:
        if isinstance(data, list):
            yield from data
        elif isinstance(data, dict):
            yield data
        else:
            raise ValueError(f"Unsupported top-level JSON type in {path}")
        return

    # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029 and
    # U+0085, which may appear unescaped inside JSON strings and would cut a
    # valid record in half. read_text() has already normalised "\r\n" and
    # "\r" to "\n".
    for lineno, raw_line in enumerate(text.split("\n"), 1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            raise ValueError(
                f"Invalid JSON on line {lineno} in {path}: {e}"
            ) from e
        yield record
|
|
| |
| |
| |
# Merge every record from the input files into one JSON array at out_path.
out_path.parent.mkdir(parents=True, exist_ok=True)

# Stream into a sibling temp file and rename it into place only on success,
# so a missing or invalid input never leaves a truncated (invalid JSON)
# out_path behind or destroys a previously written good one.
tmp_path = out_path.with_name(out_path.name + ".tmp")

count = 0
try:
    with tmp_path.open("w", encoding="utf-8") as out:
        out.write("[\n")
        for fp in files:
            for rec in iter_records(Path(fp)):
                # A separator goes before every record except the first.
                if count:
                    out.write(",\n")
                out.write(json.dumps(rec, ensure_ascii=False))
                count += 1
        out.write("\n]")
except BaseException:
    # Remove the partial temp file, then let the original error propagate.
    if tmp_path.exists():
        tmp_path.unlink()
    raise

# Same directory as the target, so this is a plain rename.
tmp_path.replace(out_path)

print(f"✓ Wrote {count} records to {out_path.resolve()}")
|
|