import json
from pathlib import Path
from typing import Dict, Union
from omegaconf import OmegaConf
[docs]
def is_package_available(package_name: str) -> bool:
try:
import importlib
package_exists = importlib.util.find_spec(package_name) is not None
return package_exists
except Exception:
return False
[docs]
def read_jsonl_to_mapping(
jsonl_file: Union[str, Path, list[Union[str, Path]]],
key_col: str,
value_col: str,
) -> Dict[str, str]:
"""
Read two columns, indicated by `key_col` and `value_col`, from the
given jsonl file to return the mapping dict
TODO handle duplicate keys
"""
mapping = {}
if not isinstance(jsonl_file, list):
jsonl_files = [jsonl_file]
else:
jsonl_files = jsonl_file
for jsonl_file in jsonl_files:
with open(jsonl_file, 'r') as file:
for line in file.readlines():
data = json.loads(line.strip())
key = data[key_col]
value = data[value_col]
mapping[key] = value
return mapping
[docs]
def setup_resume_cfg(config: dict, do_print: bool = True):
if "resume_from_checkpoint" in config["trainer"]:
ckpt_dir = Path(config["trainer"]["resume_from_checkpoint"])
if "resume_from_config" in config["trainer"]:
resumed_config = config["trainer"]["resume_from_config"]
else:
exp_dir = ckpt_dir.parent.parent
resumed_config = exp_dir / "config.yaml"
resumed_config = OmegaConf.load(resumed_config)
resumed_config["trainer"].update({
"resume_from_checkpoint": ckpt_dir.__str__(),
"logging_config": config["trainer"]
["logging_config"], # for resume wandb runs
})
elif config.get("auto_reusme_from_latest_ckpt", False):
exp_dir = Path(config["exp_dir"])
ckpt_root = exp_dir / "checkpoints"
if ckpt_root.is_dir() and any(p.is_dir() for p in ckpt_root.iterdir()):
# use last ckpt
ckpt_dir: Path = sorted((exp_dir / "checkpoints").iterdir())[-1]
resumed_config = OmegaConf.load(exp_dir / "config.yaml")
resumed_config["trainer"].update({
"resume_from_checkpoint": ckpt_dir.__str__(),
"logging_config": config["trainer"]
["logging_config"], # for resume wandb runs
})
else:
resumed_config = config
else:
resumed_config = config
if do_print:
if "resume_from_checkpoint" in resumed_config["trainer"]:
print(
f'\n train will resume from checkpoint: {resumed_config["trainer"]["resume_from_checkpoint"]}\n '
)
else:
print('\n train will start from scratch\n ')
return resumed_config