Join the waitlist — we'll email you when a spot opens.
pip install runlog-sdk
Or install from source:
pip install git+https://github.com/runlog-in/runlog-sdk.git
from runlogger import RunLogger
# Quickstart: create a run, stream training/eval metrics, and honour a pause
# request from the dashboard. train_one_step(), evaluate(), save_checkpoint()
# and scheduler stand in for your own training code.
logger = RunLogger(
    base_url="https://runlog.in",
    project_name="my-project",  # created automatically if missing
    api_token="rl-gb-...",  # Dashboard → API Tokens
    run_name="run-1",  # optional — auto-generated if omitted
    config={"model": "gpt2", "params": "125M"},
    tags=["baseline", "v1"],
    offline_mode=True,  # preserve data if connection drops
)

total_steps = 1000
best = float("inf")  # lowest validation loss seen so far

for step in range(total_steps):
    loss = train_one_step()
    # Metric values must be scalars, so log the first param group's LR
    # rather than the list a scheduler returns.
    logger.log(
        step=step,
        total_steps=total_steps,
        loss=loss,
        lr=scheduler.get_last_lr()[0],
    )

    if step % 100 == 0:
        val_loss = evaluate()
        is_best = val_loss < best
        if is_best:
            best = val_loss
        logger.log_eval(step=step, val_loss=val_loss, is_best=is_best)

    # Poll once per step: the pause flag clears after being read.
    if logger.should_pause():
        save_checkpoint(step)
        logger.finish("paused")
        break
else:
    # Reached only when the loop ran to the end without a pause, so the run
    # is finished exactly once and a paused run is not closed a second time.
    logger.finish()
Creates a new run and connects to the dashboard.
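Parameters (reconstructed from a collapsed reference table; confirm names against the SDK):
- base_url — server URL, e.g. https://runlog.in
- run_name — optional; auto-generated if omitted (e.g. cosmic-nebula-42)
- start_step — Default: 0.
- tags — e.g. ["baseline", "fp16", "v2"]
- log_system_stats — append hardware metrics to every log() call. Default: True.
- offline_mode — Default: True. Requires a supported plan.
- capture_terminal — Default: True.
- verbose — Default: False.
- (one further list-valued option whose name was lost) — Default: [].
logger = RunLogger(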
base_url = "https://runlog.in",
api_token = "rl-gb-...",
project_name = "llm-pretraining",
run_name = "gpt2-run-3",
config = {"params": "125M", "batch_size": 32, "max_steps": 50000},
start_step = 5000,
tags = ["fp16", "warmup-cosine"],
notes = "Resume from best checkpoint, new LR schedule",
log_system_stats = True,
offline_mode = True,
)
Log training metrics at the current step. Pass any keyword arguments — each becomes a chart on the dashboard. total_steps enables the progress bar. Buffering and rate limiting are handled automatically.
# Per-step training metrics: every keyword argument becomes its own chart,
# and total_steps enables the dashboard progress bar.
logger.log(
    step=step,
    total_steps=total_steps,
    train_loss=loss.item(),
    lr=scheduler.get_last_lr()[0],
    tokens_per_sec=tokens_per_sec,
    total_tokens=step * batch_size * seq_len,
    eta_seconds=(total_steps - step) * step_time,
)
Log evaluation metrics. Tracked separately from training metrics on the dashboard. Pass is_best=True to flag the current best checkpoint.
# Evaluation metrics are tracked separately from training metrics;
# is_best=True flags the current best checkpoint.
logger.log_eval(
    step=step,
    val_loss=val_loss,
    ppl=math.exp(val_loss),
    accuracy=accuracy,
    is_best=is_best,
    checkpoint_saved=is_best,
    checkpoint_path="checkpoints/best.pt" if is_best else None,
)
Upload a file artifact attached to the run. Artifacts appear in the run's Artifacts panel. Supported types: model | dataset | image | file.
Parameters: name — e.g. "best-model". type — one of model | dataset | image | file. metadata — optional dict, e.g. {"val_loss": 0.42, "step": 5000}.
logger.log_artifact("checkpoints/best.pt",
name="best-model", type="model",
metadata={"val_loss": 0.42, "step": 5000})
logger.log_artifact("data/train.csv",
name="training-data", type="dataset",
metadata={"rows": 50000})
logger.log_artifact("outputs/confusion_matrix.png",
name="confusion-matrix", type="image")
Returns True if a pause was triggered from the dashboard. Call once per step — the flag clears automatically after being read.
# The pause flag clears once it has been read, so check it once per step.
if logger.should_pause():
    save_checkpoint(step)
    logger.finish("paused")
    raise SystemExit(0)  # same exception sys.exit(0) raises
Mark the run as done. Always call this at the end of your script. Status options: completed | crashed | paused. Waits up to 10 seconds for any pending data before closing.
When used as a context manager, RunLogger automatically calls finish("completed") on normal exit and finish("crashed") if an exception is raised.
# Leaving the with-block calls finish() for you.
with RunLogger(...) as logger:
    for step in range(steps):
        logger.log(step=step, loss=loss)
RunLogger's offline mode is designed for real-world training conditions where connections are unreliable. Enable it once — everything else is automatic.
# Offline buffering is switched on with a single constructor flag.
logger = RunLogger(
    ...,
    offline_mode=True,  # default
)
When offline_mode=True:
Offline mode requires a supported plan. If your plan does not include it, it is disabled automatically at startup with a warning. If your plan is upgraded mid-run, offline mode activates immediately — no restart needed.
offline_mode=True | offline_mode=True | offline_mode=False
logger = RunLogger(
base_url = "https://runlog.in",
api_token = "rl-gb-...",
project_name = "my-project",
run_name = "pytorch-run",
config = {"arch": "gpt2", "batch_size": batch_size},
offline_mode = True,
)
# PyTorch training loop with RunLogger. The run is finished exactly once:
# "paused" on a dashboard pause, "completed" when the loop runs to the end,
# "crashed" if anything raises.
try:
    for step in range(total_steps):
        # Clear gradients from the previous step; without this they
        # accumulate across iterations.
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
        scheduler.step()

        logger.log(
            step=step,
            total_steps=total_steps,
            train_loss=loss.item(),
            lr=scheduler.get_last_lr()[0],
            tokens_per_sec=batch_size * seq_len / step_time,
        )

        if step % eval_every == 0:
            val_loss = evaluate(model, val_loader)
            is_best = val_loss < best_loss
            if is_best:
                # Remember the new best so later evaluations are compared
                # against it rather than the initial value.
                best_loss = val_loss
                torch.save(model.state_dict(), "best.pt")
            logger.log_eval(
                step=step,
                val_loss=val_loss,
                is_best=is_best,
                checkpoint_path="best.pt" if is_best else None,
            )

        # Poll once per step: the pause flag clears after being read.
        if logger.should_pause():
            torch.save(model.state_dict(), f"pause_{step}.pt")
            logger.finish("paused")
            break
    else:
        # Only reached when the loop was not interrupted by a pause.
        logger.finish("completed")
except Exception:
    logger.finish("crashed")
    raise
For multi-GPU / DDP training, log only from rank 0:
# Only the rank-0 process reports metrics.
if rank == 0:
    logger.log(step=step, loss=loss)
from runlogger import RunLogger
from transformers import TrainerCallback
class RunLoggerCallback(TrainerCallback):
    """Forward Hugging Face ``Trainer`` events to a RunLogger run.

    Training logs go to ``logger.log``, evaluation metrics go to
    ``logger.log_eval``, and the run is finished when training ends.
    """

    def __init__(self, logger):
        """Store the RunLogger instance that receives the metrics."""
        self.logger = logger

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log the Trainer's metrics at the current global step."""
        # In multi-GPU / DDP training only the main process reports, so each
        # metric is not sent once per rank.
        if logs and state.is_world_process_zero:
            self.logger.log(
                step=state.global_step,
                total_steps=state.max_steps,
                **logs,
            )

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Log evaluation metrics at the current global step."""
        if metrics and state.is_world_process_zero:
            self.logger.log_eval(step=state.global_step, **metrics)

    def on_train_end(self, args, state, control, **kwargs):
        """Finish the run once training is over."""
        self.logger.finish()


# usage:
logger = RunLogger(..., offline_mode=True)
trainer = Trainer(..., callbacks=[RunLoggerCallback(logger)])
import tensorflow as tf
from runlogger import RunLogger
class RunLoggerCallback(tf.keras.callbacks.Callback):
    """Send Keras per-epoch metrics to a RunLogger run."""

    def __init__(self, logger, total_epochs):
        """Store the logger and the epoch count used as ``total_steps``."""
        # Let the Keras base class initialise its own callback state.
        super().__init__()
        self.logger = logger
        self.total_epochs = total_epochs

    def on_epoch_end(self, epoch, logs=None):
        """Log the epoch's metrics, using the epoch index as the step."""
        self.logger.log(step=epoch, total_steps=self.total_epochs, **(logs or {}))

    def on_train_end(self, logs=None):
        """Finish the run once training is over."""
        self.logger.finish()


# usage:
logger = RunLogger(..., offline_mode=True)
model.fit(X, y, epochs=50, callbacks=[RunLoggerCallback(logger, total_epochs=50)])
import xgboost as xgb
from runlogger import RunLogger
class RunLoggerXGBCallback(xgb.callback.TrainingCallback):
    """Send XGBoost per-round evaluation metrics to a RunLogger run."""

    def __init__(self, logger, total_rounds):
        """Store the logger and the round count used as ``total_steps``."""
        super().__init__()
        self.logger = logger
        self.total_rounds = total_rounds

    def after_iteration(self, model, epoch, evals_log):
        """Log the latest value of every metric as ``<dataset>_<metric>``."""
        metrics = {}
        for data, metric_dict in evals_log.items():
            for name, vals in metric_dict.items():
                latest = vals[-1]
                # xgb.cv records (mean, std) tuples; log the mean so the
                # value stays a plain number.
                if isinstance(latest, tuple):
                    latest = latest[0]
                metrics[f"{data}_{name}"] = latest
        self.logger.log(step=epoch, total_steps=self.total_rounds, **metrics)
        return False  # returning True would ask XGBoost to stop training

    def after_training(self, model):
        """Finish the run when boosting ends, like the other callbacks."""
        self.logger.finish()
        return model


# usage:
logger = RunLogger(..., offline_mode=True)
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dval, "val")],
    callbacks=[RunLoggerXGBCallback(logger, 100)],
)
Log any file as an artifact — models, datasets, plots, configs. Artifacts appear in the run's Artifacts panel and stay associated with the run permanently.
Typical files: model (.pt, .pkl, .onnx, …) · dataset (.csv, .jsonl, …)
# model checkpoint
logger.log_artifact("checkpoints/best.pt",
name="best-model", type="model",
metadata={"val_loss": 0.42, "step": 5000})
# dataset
logger.log_artifact("data/train.csv",
name="training-data", type="dataset",
metadata={"rows": 50000, "source": "FineWeb"})
# evaluation plot
logger.log_artifact("outputs/confusion_matrix.png",
name="confusion-matrix", type="image")
When optional packages are installed, RunLogger automatically appends hardware metrics to every log() call. These appear as charts alongside your training metrics — no extra code needed.
# install optional dependencies
pip install pynvml psutil
# disable if not needed
logger = RunLogger(..., log_system_stats=False)
Stats are collected from GPU 0. If no GPU is present, only CPU/RAM metrics are logged. If neither package is installed, system stats are silently skipped.
Pro and Elite plans support team workspaces. Create a workspace, invite teammates by email, and share projects across your organization.
For team workspaces, go to Workspace in the sidebar. Roles:
Plans and limits are managed from the dashboard's Plans page. Upgrade or downgrade at any time — changes take effect immediately, even mid-run.
When capture_terminal=True (the default), RunLogger intercepts all stdout and stderr output from your training script and streams it to the dashboard in real time alongside your metrics. No extra code needed — print() statements, tqdm progress bars, and framework logs all appear automatically.
# Terminal capture is on unless explicitly disabled.
logger = RunLogger(
    ...,
    capture_terminal=True,  # default — streams all print() output to dashboard
)
If the connection drops mid-run, terminal chunks are stored locally and flushed to the dashboard on reconnect — in order, with no gaps. This requires offline_mode=True.
# Opt out of terminal capture.
logger = RunLogger(
    ...,
    capture_terminal=False,  # raw stdout only, nothing sent to dashboard
)
Disable if your script produces extremely high-frequency output that you don't need on the dashboard, or if you're running in an environment where stdout redirection is not allowed.
If a run was interrupted and you want to sync its locally buffered data without starting a new run, use the runlogger-sync command:
# scan the default dumps/ directory
runlogger-sync
# scan a specific directory
runlogger-sync --dir /path/to/runs
# sync one specific file
runlogger-sync --file dumps/.runlog_abc123.db
# show full debug output
runlogger-sync --verbose
runlogger-sync -v
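The default scan directory is dumps/. The server URL and token can also be supplied through the RUNLOGGER_URL and RUNLOGGER_TOKEN environment variables:
export RUNLOGGER_URL=https://runlog.in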
export RUNLOGGER_TOKEN=rl-...
runlogger-sync
The token and server URL are stored inside each DB file, so you usually don't need to pass them manually. Safe to run multiple times — already-synced packets are skipped automatically. Unrecoverable DB files (missing token or payload) are discarded silently.
If you don't provide a run_name, one is generated automatically in the format adjective-noun-number:
cosmic-nebula-42
silver-ridge-317
eager-summit-5
Names are readable, memorable, and unique at any practical project scale. You'll see them on the dashboard and in logs. To use a fixed name instead:
# Supplying run_name replaces the auto-generated adjective-noun-number name.
logger = RunLogger(
    ...,
    run_name="gpt2-baseline-run3",
)
These are raised immediately as RuntimeError before training begins:
RuntimeError: Invalid API token: rl-...
RuntimeError: [Runlog] account is banned.
With offline_mode=True, retried automatically on reconnect.
try:
with RunLogger(...) as logger:
for step in range(max_steps):
loss = train()
logger.log(step=step, loss=loss)
except RuntimeError as e:
print(f"RunLogger error: {e}")
# continue training without logging, or exit
Pass verbose=True to see internal detail — packet counts, sync intervals, orphan run recovery. Useful for diagnosing connection or sync issues.
logger = RunLogger(..., verbose=True)
No — with the context manager, finish() is called automatically. Normal exit calls finish("completed"). An exception calls finish("crashed"). The exception is not suppressed.
If finish() is never called, the run stays marked as running on the dashboard indefinitely. Always call finish() or use the context manager.
Yes. Log only from rank 0 to avoid duplicate data:
# Logging from rank 0 only avoids duplicate data.
if rank == 0:
    logger.log(step=step, loss=loss)
No — metric values must be int, float, or bool. Pass strings in config, tags, or notes instead.
Yes. Each RunLogger instance is independent and creates its own run.
No. All logging is non-blocking — your training loop is never slowed down.
If offline_mode=True, all data logged before the crash is preserved and recovered automatically the next time you start a run from the same directory. No manual steps required.
Offline data is stored in a dumps/ directory relative to where your training script runs. Files are named .runlog_<run_id>.db and cleaned up automatically after a successful sync.
Pass verbose=True to RunLogger(...) to see full internal detail, or use runlogger-sync --verbose for manual sync debugging.
RunLogger is designed exclusively for use with runlog.in. Self-hosted deployments are not supported. Set base_url to https://runlog.in.