|
import json |
|
from pathlib import Path |
|
from time import perf_counter |
|
from typing import Any, Dict |
|
|
|
from tqdm.auto import tqdm |
|
|
|
|
|
def folder_to_json(folder_in: Path, json_path: Path) -> None:
    """
    Consolidate JSON-lines files from a folder into a single JSON array file.

    Every regular file under ``folder_in`` (searched recursively) whose name
    matches ``*wiki*`` is read line by line. Each non-blank line is parsed as
    one JSON object, passed through ``restructure_articles`` and collected.
    The collected list is then written to ``json_path`` as one JSON array.
    Progress and timings are reported through tqdm.

    Parameters:
        folder_in (Path): Path to the input folder containing the JSON-lines
            files to process.
        json_path (Path): Path to the output JSON file where the processed
            data will be written. Missing parent directories are created.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed article has no ``'text'`` key.

    Example:
        folder_to_json(Path("/path/to/input/folder"), Path("/path/to/output.json"))
    """
    json_out = []

    process_start = perf_counter()

    # rglob also yields directories whose name matches the pattern; opening
    # one would raise, so keep regular files only. Sorting by the string form
    # of the path keeps the output order deterministic across runs.
    all_files = sorted(
        (path for path in folder_in.rglob('*wiki*') if path.is_file()),
        key=str,
    )

    with tqdm(total=len(all_files), desc='Processing', unit='file') as pbar:
        for file_path in all_files:
            pbar.set_postfix_str(f"File: {file_path.name} | Dir: {file_path.parent}", refresh=True)

            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    # Tolerate blank / whitespace-only lines (e.g. a trailing
                    # newline at end of file) instead of failing in json.loads.
                    if not line.strip():
                        continue

                    article = json.loads(line)
                    json_out.append(restructure_articles(article))

            pbar.update(1)

        time_taken_to_process = perf_counter() - process_start
        pbar.write(f"Wiki processed in {round(time_taken_to_process, 2)} seconds!")

        pbar.write("Writing file!")

    # Make sure the destination folder exists so the write below cannot fail
    # with FileNotFoundError on a fresh checkout.
    json_path.parent.mkdir(parents=True, exist_ok=True)

    write_start = perf_counter()

    with open(json_path, "w", encoding='utf-8') as outfile:
        json.dump(json_out, outfile)

    time_taken_to_write = perf_counter() - write_start

    # tqdm.write is a classmethod, so it is safe to use once the bar is closed.
    tqdm.write(f"File written in {round(time_taken_to_write, 2)} seconds!")
|
|
|
|
|
def restructure_articles(article: Dict[str, Any]) -> Dict[str, Any]:
    """
    Split an article into haystack-style ``content`` and ``meta`` parts.

    Args:
        - article (Dict[str, Any]): The article to restructure. It must
          contain a ``'text'`` key.

    Returns:
        - Dict[str, Any]: A new dict with two keys: ``'content'`` holds the
          value of ``article['text']`` and ``'meta'`` holds every other
          key/value pair of the article, in their original order.

    Raises:
        - KeyError: If ``article`` has no ``'text'`` key.
    """
    # Read the body first so a missing 'text' key fails before any other work.
    body = article['text']

    # Everything that is not the body becomes metadata.
    metadata = {}
    for field, value in article.items():
        if field == 'text':
            continue
        metadata[field] = value

    return {'content': body, 'meta': metadata}
|
|
|
|
|
if __name__ == '__main__':
    # The project root is taken to be three directory levels above this file.
    project_root = Path(__file__).parents[2]

    # Input folder with the raw files, and the consolidated output file.
    raw_folder = project_root / 'data/raw/output'
    consolidated_file = project_root / 'data/consolidated/simple_wiki.json'

    folder_to_json(raw_folder, consolidated_file)

    print('Done!')
|
|