How to Automate GROMACS Analysis with Python: RMSD, RMSF and Energy Plots
Every GROMACS analysis workflow involves the same sequence of terminal commands, interactive group selections, and XVG file parsing repeated across every simulation. Python’s subprocess module automates all of it — this tutorial builds a reusable pipeline that runs gmx commands, parses the output, and generates all standard analysis plots with a single script.
Running gmx commands from Python
Python’s subprocess module runs any shell command and captures its output. For GROMACS, the key challenge is that most gmx commands are interactive — they prompt you to choose atom groups. Automating this requires piping the group numbers through stdin.
import subprocess
import os
def run_gmx(command, stdin_input=None, workdir="."):
"""
Run a gmx command from Python.
stdin_input: string to pipe to stdin (for group selection prompts).
Returns (stdout, stderr, returncode).
"""
result = subprocess.run(
command,
input=stdin_input,
capture_output=True,
text=True,
cwd=workdir,
)
if result.returncode != 0:
print(f"GROMACS error:\n{result.stderr[-500:]}")
return result
# Run gmx energy — pipe "Potential\n" to select potential energy
result = run_gmx(
["gmx", "energy", "-f", "md.edr", "-o", "potential.xvg"],
stdin_input="Potential\n0\n" # "0" exits the selection prompt
)
print("Energy extraction:", "OK" if result.returncode == 0 else "FAILED")
# Run gmx rms (backbone RMSD, protein vs protein)
result = run_gmx(
["gmx", "rms", "-s", "md.tpr", "-f", "md.xtc",
"-o", "rmsd.xvg", "-tu", "ns"],
stdin_input="Backbone\nBackbone\n" # group for fit, group for RMSD
)
# Run gmx rmsf (per-residue RMSF, backbone)
result = run_gmx(
["gmx", "rmsf", "-s", "md.tpr", "-f", "md.xtc",
"-o", "rmsf.xvg", "-res"], # -res: per-residue output
stdin_input="Backbone\n"
)
"Backbone", "Protein", and "System" are always available. For a protein-ligand system, the ligand typically appears as a named group like "LIG" or "MOL". Run gmx make_ndx -f topology.gro -o index.ndx to see all available groups, then pipe the names (not numbers) to gmx commands — GROMACS accepts both.
| gmx command | What it produces | stdin groups needed |
|---|---|---|
| gmx energy | Energy terms from EDR file → XVG | Term names e.g. “Potential Temperature Pressure” |
| gmx rms | RMSD over trajectory → XVG | Fit group, RMSD group (e.g. “Backbone Backbone”) |
| gmx rmsf | Per-residue RMSF → XVG | Group to analyze (e.g. “Backbone”) |
| gmx gyrate | Radius of gyration → XVG | Group (e.g. “Protein”) |
| gmx hbond | H-bond count → XVG | Two groups (donor, acceptor) |
| gmx trjconv | Process / convert trajectory | Output group, PBC handling group |
Parsing XVG output files
GROMACS outputs all numeric data as XVG files — a simple text format with metadata header lines starting with @ or #, followed by whitespace-separated data columns. NumPy’s loadtxt handles this in one line with the comments argument.
0.000000 0.000000
0.010000 0.042381
0.020000 0.058194
…
import numpy as np
def parse_xvg(filepath):
"""
Parse a GROMACS XVG file into numpy arrays.
Returns (data, labels) where data has shape (n_frames, n_columns).
"""
labels = {}
data_lines = []
with open(filepath) as f:
for line in f:
line = line.strip()
if line.startswith("#"): continue
if line.startswith("@"):
# Extract axis labels from metadata
if "xaxis" in line and "label" in line:
labels["x"] = line.split('"')[1]
elif "yaxis" in line and "label" in line:
labels["y"] = line.split('"')[1]
continue
try:
data_lines.append([float(x) for x in line.split()])
except ValueError:
continue # skip lines that don't parse as numbers
return np.array(data_lines), labels
# Load and inspect an XVG file
data, labels = parse_xvg("rmsd.xvg")
times = data[:, 0] # first column is always time
values = data[:, 1] # second column is the metric
print(f"Loaded {len(times)} frames: {labels}")
print(f"Time range: {times[0]:.1f} – {times[-1]:.1f} (x-axis units from XVG header)")
Energy analysis — potential, temperature, pressure
def run_energy_analysis(edr_file, workdir="."):
"""Extract potential energy, temperature, and pressure from EDR."""
terms = {
"potential": ("Potential\n0\n", "energy_potential.xvg"),
"temperature": ("Temperature\n0\n", "energy_temp.xvg"),
"pressure": ("Pressure\n0\n", "energy_pressure.xvg"),
}
results = {}
for term_name, (stdin, outfile) in terms.items():
outpath = os.path.join(workdir, outfile)
run_gmx(
["gmx", "energy", "-f", edr_file, "-o", outpath],
stdin_input=stdin, workdir=workdir
)
if os.path.exists(outpath):
data, labels = parse_xvg(outpath)
results[term_name] = {
"time": data[:, 0],
"values": data[:, 1],
"label": labels.get("y", term_name),
}
mean_val = data[:, 1].mean()
print(f"{term_name}: mean = {mean_val:.2f}")
return results
RMSD analysis
def run_rmsd_analysis(tpr, xtc, outfile="rmsd.xvg", group="Backbone", workdir="."):
"""Calculate backbone RMSD and return time series arrays."""
run_gmx(
["gmx", "rms", "-s", tpr, "-f", xtc,
"-o", os.path.join(workdir, outfile), "-tu", "ns"],
stdin_input=f"{group}\n{group}\n",
workdir=workdir
)
data, _ = parse_xvg(os.path.join(workdir, outfile))
times = data[:, 0] # ns (with -tu ns flag)
rmsd_nm = data[:, 1]
rmsd_a = rmsd_nm * 10 # nm → Å
print(f"RMSD: {rmsd_a.mean():.2f} ± {rmsd_a.std():.2f} Å "
f"(max {rmsd_a.max():.2f} Å at {times[rmsd_a.argmax()]:.1f} ns)")
return times, rmsd_a
RMSF analysis
def run_rmsf_analysis(tpr, xtc, outfile="rmsf.xvg", group="Backbone", workdir="."):
"""Calculate per-residue RMSF and return residue and RMSF arrays."""
run_gmx(
["gmx", "rmsf", "-s", tpr, "-f", xtc,
"-o", os.path.join(workdir, outfile), "-res"],
stdin_input=f"{group}\n",
workdir=workdir
)
data, _ = parse_xvg(os.path.join(workdir, outfile))
resids = data[:, 0].astype(int) # residue numbers
rmsf_nm = data[:, 1]
rmsf_a = rmsf_nm * 10 # nm → Å
top_flex = resids[rmsf_a > rmsf_a.mean() + 2 * rmsf_a.std()]
print(f"Mean RMSF: {rmsf_a.mean():.2f} Å")
print(f"Highly flexible residues (mean + 2σ): {top_flex.tolist()}")
return resids, rmsf_a
Multi-panel publication plots
import matplotlib.pyplot as plt
def plot_analysis(energy_results, rmsd_times, rmsd_vals,
resids, rmsf_vals, outfile="gromacs_analysis.png"):
fig, axes = plt.subplots(2, 2, figsize=(13, 8))
axes = axes.flatten()
# Panel A: Potential energy
if "potential" in energy_results:
e = energy_results["potential"]
axes[0].plot(e["time"], e["values"], color="#534ab7", lw=0.8)
axes[0].set_xlabel("Time (ps)"); axes[0].set_ylabel("Potential energy (kJ/mol)")
axes[0].set_title("(A) Potential energy", fontweight="bold", loc="left")
# Panel B: Temperature
if "temperature" in energy_results:
t = energy_results["temperature"]
axes[1].plot(t["time"], t["values"], color="#ba7517", lw=0.8)
axes[1].axhline(300, color="gray", ls="--", lw=0.7, label="300 K target")
axes[1].set_xlabel("Time (ps)"); axes[1].set_ylabel("Temperature (K)")
axes[1].set_title("(B) Temperature", fontweight="bold", loc="left")
axes[1].legend(fontsize=9)
# Panel C: RMSD
axes[2].plot(rmsd_times, rmsd_vals, color="#0d6e54", lw=1.1)
axes[2].axhline(rmsd_vals[len(rmsd_vals)//2:].mean(),
color="#0d6e54", ls="--", lw=0.7,
label=f"Eq. mean: {rmsd_vals[len(rmsd_vals)//2:].mean():.2f} Å")
axes[2].set_xlabel("Time (ns)"); axes[2].set_ylabel("Backbone RMSD (Å)")
axes[2].set_title("(C) Backbone RMSD", fontweight="bold", loc="left")
axes[2].legend(fontsize=9)
# Panel D: RMSF
axes[3].plot(resids, rmsf_vals, color="#185fa5", lw=1.0)
axes[3].fill_between(resids, rmsf_vals, alpha=0.15, color="#185fa5")
axes[3].axhline(rmsf_vals.mean() + rmsf_vals.std(),
color="#ba7517", ls="--", lw=0.7, label="Mean + 1σ")
axes[3].set_xlabel("Residue"); axes[3].set_ylabel("RMSF (Å)")
axes[3].set_title("(D) Per-residue RMSF", fontweight="bold", loc="left")
axes[3].legend(fontsize=9)
plt.tight_layout()
plt.savefig(outfile, dpi=300, bbox_inches="tight")
plt.close()
print(f"Saved {outfile}")
Complete reusable pipeline
#!/usr/bin/env python3
"""
GROMACS simulation analysis pipeline.
Usage: python analyze_simulation.py
Edit the CONFIGURATION section for your simulation.
"""
import subprocess, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ── CONFIGURATION ─────────────────────────────────────────────────────
SIM_DIR = "./simulation" # directory containing GROMACS output files
TPR_FILE = "md.tpr"
XTC_FILE = "md_vis.xtc" # PBC-corrected trajectory
EDR_FILE = "md.edr"
OUT_DIR = "./analysis"
GROUP = "Backbone" # GROMACS group for RMSD/RMSF
# ──────────────────────────────────────────────────────────────────────
os.makedirs(OUT_DIR, exist_ok=True)
def run_gmx(cmd, stdin=None):
r = subprocess.run(cmd, input=stdin, capture_output=True,
text=True, cwd=SIM_DIR)
if r.returncode != 0:
print(f"Warning: {' '.join(cmd[:3])} returned {r.returncode}")
return r
def parse_xvg(path):
rows = []
with open(path) as f:
for line in f:
if line.startswith(("@", "#")): continue
try: rows.append([float(x) for x in line.split()])
except ValueError: pass
return np.array(rows)
# 1. Energy terms
for term, xvg in [("Potential", "potential.xvg"),
("Temperature", "temperature.xvg"),
("Pressure", "pressure.xvg")]:
run_gmx(["gmx", "energy", "-f", EDR_FILE,
"-o", os.path.join(OUT_DIR, xvg)],
stdin=f"{term}\n0\n")
# 2. RMSD
run_gmx(["gmx", "rms", "-s", TPR_FILE, "-f", XTC_FILE,
"-o", os.path.join(OUT_DIR, "rmsd.xvg"), "-tu", "ns"],
stdin=f"{GROUP}\n{GROUP}\n")
# 3. RMSF
run_gmx(["gmx", "rmsf", "-s", TPR_FILE, "-f", XTC_FILE,
"-o", os.path.join(OUT_DIR, "rmsf.xvg"), "-res"],
stdin=f"{GROUP}\n")
# 4. Parse and plot
rmsd_d = parse_xvg(os.path.join(OUT_DIR, "rmsd.xvg"))
rmsf_d = parse_xvg(os.path.join(OUT_DIR, "rmsf.xvg"))
pot_d = parse_xvg(os.path.join(OUT_DIR, "potential.xvg"))
tmp_d = parse_xvg(os.path.join(OUT_DIR, "temperature.xvg"))
rmsd_a = rmsd_d[:, 1] * 10 # nm → Å
rmsf_a = rmsf_d[:, 1] * 10
fig, axes = plt.subplots(2, 2, figsize=(13, 8))
axes[0].plot(pot_d[:, 0], pot_d[:, 1], "#534ab7", lw=0.8)
axes[0].set(xlabel="Time (ps)", ylabel="Potential energy (kJ/mol)",
title="(A) Potential energy")
axes[1].plot(tmp_d[:, 0], tmp_d[:, 1], "#ba7517", lw=0.8)
axes[1].axhline(300, color="gray", ls="--", lw=0.7)
axes[1].set(xlabel="Time (ps)", ylabel="Temperature (K)",
title="(B) Temperature")
axes[2].plot(rmsd_d[:, 0], rmsd_a, "#0d6e54", lw=1.1)
axes[2].set(xlabel="Time (ns)", ylabel="Backbone RMSD (Å)",
title="(C) Backbone RMSD")
axes[3].plot(rmsf_d[:, 0], rmsf_a, "#185fa5", lw=1.0)
axes[3].fill_between(rmsf_d[:, 0], rmsf_a, alpha=0.15, color="#185fa5")
axes[3].set(xlabel="Residue", ylabel="RMSF (Å)",
title="(D) Per-residue RMSF")
for ax in axes: ax.title.set_fontweight("bold")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "analysis_summary.png"), dpi=300, bbox_inches="tight")
plt.close()
# Export summary CSV
pd.DataFrame({"resid": rmsf_d[:, 0].astype(int), "rmsf_A": rmsf_a}).to_csv(
os.path.join(OUT_DIR, "rmsf_summary.csv"), index=False)
print(f"\nAnalysis complete.")
print(f"Mean RMSD: {rmsd_a.mean():.2f} ± {rmsd_a.std():.2f} Å")
print(f"Mean RMSF: {rmsf_a.mean():.2f} Å")
python analyze_simulation.py at the end of your run.sh — the analysis figures and CSV are generated automatically while you sleep, ready to inspect the next morning. This makes every simulation self-documenting from the moment it finishes.
GROMACS Python automation in one paragraph
Use Python’s subprocess.run() with input=stdin_string to pass group selections to interactive gmx commands without manual intervention. Parse XVG output files by skipping lines starting with @ or # and loading the rest as NumPy arrays — always multiply nm values by 10 to get ångströms. Wrap each gmx command (energy, rms, rmsf) in a function that runs the command and returns the parsed arrays. Combine the functions into a pipeline script with a configuration section at the top — set the file paths once, run the script, and get all four standard analysis panels (potential energy, temperature, RMSD, RMSF) as a single publication-ready figure plus CSV exports. Add the script to your SLURM job to generate analysis automatically when the simulation finishes.