# Source code for mirar.processors.sources.json_exporter

"""
Module with classes to write a source table to JSON
"""

import json
import logging
from pathlib import Path
from typing import Optional

import pandas as pd

from mirar.data import SourceBatch
from mirar.paths import BASE_NAME_KEY, base_output_dir, get_output_dir
from mirar.processors.base_processor import BaseSourceProcessor

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Top-level key of the exported JSON object that holds the source-list metadata
JSON_METADATA_KEY = "metadata"
# Top-level key of the exported JSON object that holds the source records
JSON_SOURCE_KEY = "sources"
# Suffix given to the exported file name
JSON_SUFFIX = ".json"


class JSONExporter(BaseSourceProcessor):
    """
    Class to export a source table to JSON

    For each source list in a batch, one JSON file is written. The file holds
    the source records under ``JSON_SOURCE_KEY`` and the source-list metadata
    under ``JSON_METADATA_KEY``. Columns holding binary data are dropped
    before the export.
    """

    base_key = "JSONEXPORT"

    def __init__(
        self,
        output_dir_name: Optional[str] = None,
        output_dir: str | Path = base_output_dir,
        export_keys: Optional[list[str]] = None,
    ):
        """
        :param output_dir_name: forwarded to ``get_output_dir`` as ``dir_root``
        :param output_dir: forwarded to ``get_output_dir`` as ``output_dir``
            (converted to a ``Path``)
        :param export_keys: stored on the instance.
            NOTE(review): not read by any method of this class - confirm
            whether it is meant to restrict the exported columns/keys.
        """
        super().__init__()
        self.output_dir_name = output_dir_name
        self.output_dir = Path(output_dir)
        self.export_keys = export_keys

    def description(self) -> str:
        """
        Describe the processor

        :return: short human-readable description
        """
        return "Processor to save sources to json files."

    @staticmethod
    def _drop_binary_columns(source_table: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the columns whose inferred dtype is ``bytes``

        :param source_table: table to filter
        :return: the table itself if it has no binary column, otherwise a
            new table without those columns
        """
        binary_cols = [
            col
            for col in source_table.columns
            if pd.api.types.infer_dtype(source_table[col]) == "bytes"
        ]

        if not binary_cols:
            return source_table

        logger.debug(
            "The following columns contain binary data: %s. "
            "These will not be exported to json.",
            binary_cols,
        )
        return source_table.drop(columns=binary_cols)

    def _apply_to_sources(
        self,
        batch: SourceBatch,
    ) -> SourceBatch:
        """
        Write every source list of the batch to its own JSON file

        The file is named after ``metadata[BASE_NAME_KEY]`` with its suffix
        replaced by ``JSON_SUFFIX``, and is placed in the directory returned
        by ``get_output_dir`` (created if missing).

        :param batch: batch of source lists to export
        :return: the batch that was passed in
        """
        for source_list in batch:
            # Work on a copy of the table returned by the source list
            source_table = source_list.get_data().copy()
            metadata = source_list.get_metadata()

            source_table = self._drop_binary_columns(source_table)

            # Use pandas to convert the source table to a json object and back
            # This ensures json-able data types are used
            json_data = {
                JSON_SOURCE_KEY: json.loads(source_table.to_json(orient="records")),
                JSON_METADATA_KEY: json.loads(pd.Series(metadata).to_json()),
            }

            output_dir = get_output_dir(
                dir_root=self.output_dir_name,
                sub_dir=self.night_sub_dir,
                output_dir=self.output_dir,
            )
            output_dir.mkdir(parents=True, exist_ok=True)

            # Keep only the file name, so any directory part of the base
            # name does not leak into the output path
            json_path = output_dir.joinpath(
                Path(metadata[BASE_NAME_KEY]).with_suffix(JSON_SUFFIX).name
            )

            logger.debug("Writing source table to %s", json_path)

            with open(json_path, "w", encoding="utf8") as f:
                json.dump(json_data, f)

        return batch