How to Parse AutoDock Vina Docking Results with Python: From PDBQT to Ranked Hit List
A virtual screening run produces hundreds or thousands of PDBQT output files. Manually opening each one to copy binding affinities is impractical from the first compound. This tutorial builds a Python pipeline that reads every output file, extracts all binding modes, loads the results into pandas, and exports a ranked hit list ready for follow-up analysis.
The PDBQT output format — where scores live
AutoDock Vina writes one PDBQT file per ligand. Inside, each binding mode (pose) starts with a REMARK VINA RESULT line containing three values: the binding affinity in kcal/mol, the RMSD from the best pose, and the RMSD from the reference pose. The best pose for each ligand is always the first one listed.
MODEL 1
ATOM 1 C LIG A 1 …
ENDMDL
REMARK VINA RESULT: -8.7 1.243 1.243
MODEL 2
…
REMARK VINA RESULT: -8.1 2.018 2.018
MODEL 3
The three numbers after REMARK VINA RESULT: are: affinity (kcal/mol, more negative = better), RMSD lower bound (Å from best pose), and RMSD upper bound (Å from best pose). For most virtual screening purposes, you only need the affinity of the first (best) pose.
affinity < threshold where threshold is a negative number like −8.0.
Parsing a single PDBQT file
def parse_vina_pdbqt(filepath, all_modes=False):
"""
Parse a Vina PDBQT output file.
all_modes=False: return only the best affinity (float or None)
all_modes=True: return list of dicts with affinity, rmsd_lb, rmsd_ub
"""
modes = []
with open(filepath) as f:
for line in f:
if line.startswith("REMARK VINA RESULT:"):
parts = line.split()
# parts: ['REMARK', 'VINA', 'RESULT:', affinity, rmsd_lb, rmsd_ub]
modes.append({
"affinity": float(parts[3]),
"rmsd_lb": float(parts[4]),
"rmsd_ub": float(parts[5]),
})
if not modes:
return None if not all_modes else []
return modes if all_modes else modes[0]["affinity"]
# Test on a single file
best_affinity = parse_vina_pdbqt("compound_001_out.pdbqt")
print(f"Best affinity: {best_affinity} kcal/mol")
# Best affinity: -9.2 kcal/mol
# Get all binding modes
all_modes = parse_vina_pdbqt("compound_001_out.pdbqt", all_modes=True)
for i, mode in enumerate(all_modes, 1):
print(f"Mode {i}: {mode['affinity']:.1f} kcal/mol (RMSD: {mode['rmsd_lb']:.2f} Å)")
Batch parsing a screening directory
import glob
import os
def parse_screening_results(output_dir, pattern="*_out.pdbqt"):
"""
Parse all Vina output PDBQT files in a directory.
Returns a list of result dicts.
"""
pdbqt_files = sorted(glob.glob(os.path.join(output_dir, pattern)))
if not pdbqt_files:
raise FileNotFoundError(
f"No files matching '{pattern}' found in {output_dir}")
print(f"Found {len(pdbqt_files)} output files — parsing...")
results = []
failed = []
for filepath in pdbqt_files:
filename = os.path.basename(filepath)
# Derive compound name: "compound_001_out.pdbqt" → "compound_001"
compound = filename.replace("_out.pdbqt", "").replace(".pdbqt", "")
modes = parse_vina_pdbqt(filepath, all_modes=True)
if not modes:
failed.append(compound)
continue
results.append({
"compound": compound,
"best_affinity": modes[0]["affinity"],
"n_modes": len(modes),
"affinity_2": modes[1]["affinity"] if len(modes) > 1 else None,
"affinity_3": modes[2]["affinity"] if len(modes) > 2 else None,
"rmsd_lb_2": modes[1]["rmsd_lb"] if len(modes) > 1 else None,
"filepath": filepath,
})
print(f"Parsed: {len(results)} OK, {len(failed)} failed")
if failed:
print(f"Failed: {failed[:5]}{'…' if len(failed)>5 else ''}")
return results
results = parse_screening_results("./docking_output/")
# Found 1000 output files — parsing...
# Parsed: 997 OK, 3 failed
Loading results into pandas
import pandas as pd
df = pd.DataFrame(results)
df = df.sort_values("best_affinity").reset_index(drop=True) # ascending = best first
df["rank"] = df.index + 1 # 1-based rank column
print(df[["rank", "compound", "best_affinity", "n_modes"]].head(10).to_string(index=False))
print(f"\nTotal compounds: {len(df)}")
print(f"Best score: {df['best_affinity'].min():.2f} kcal/mol ({df.iloc[0]['compound']})")
print(f"Score range: {df['best_affinity'].min():.2f} to {df['best_affinity'].max():.2f} kcal/mol")
print(f"Mean score: {df['best_affinity'].mean():.2f} ± {df['best_affinity'].std():.2f} kcal/mol")
Sorting, filtering, and ranking hits
The choice of affinity cutoff for “hits” depends on your target and the size of your library. Common thresholds for protein targets:
# Filter by affinity threshold
hits = df[df["best_affinity"] <= -8.0].copy()
print(f"Hits below -8 kcal/mol: {len(hits)} ({len(hits)/len(df)*100:.1f}%)")
# Top N percentile filter
top_5pct_cutoff = df["best_affinity"].quantile(0.05) # lowest 5%
top_hits = df[df["best_affinity"] <= top_5pct_cutoff]
print(f"Top 5% cutoff: {top_5pct_cutoff:.2f} kcal/mol ({len(top_hits)} compounds)")
# Filter by score AND pose diversity
# Keep only hits with at least 2 modes (more confident pose)
confident_hits = hits[hits["n_modes"] >= 2].copy()
# Add score gap between best and second best pose
# A small gap (< 1 kcal/mol) suggests a well-defined binding mode
confident_hits["score_gap"] = (confident_hits["affinity_2"] -
confident_hits["best_affinity"])
well_defined = confident_hits[confident_hits["score_gap"] < 1.0]
print(f"Well-defined hits (gap < 1 kcal/mol): {len(well_defined)}")
Exporting the ranked results table
# Export full results ranked by affinity
export_cols = ["rank", "compound", "best_affinity",
"affinity_2", "affinity_3", "n_modes", "filepath"]
df[export_cols].to_csv("all_results_ranked.csv", index=False)
# Export top hits only
hits[["rank", "compound", "best_affinity", "n_modes"]].to_csv(
"hits_below_minus8.csv", index=False)
# Export as Excel with formatting
with pd.ExcelWriter("screening_results.xlsx", engine="openpyxl") as writer:
df[export_cols].head(100).to_excel(writer, sheet_name="Top 100", index=False)
hits[export_cols].to_excel(writer, sheet_name="Hits < -8", index=False)
print("Exported: all_results_ranked.csv, hits_below_minus8.csv, screening_results.xlsx")
Plotting score distributions
import matplotlib.pyplot as plt
import numpy as np
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Panel A: Score distribution histogram
ax1.hist(df["best_affinity"], bins=40, color="#185fa5",
edgecolor="white", linewidth=0.5, alpha=0.85)
ax1.axvline(-8.0, color="#ba7517", linestyle="--", linewidth=1.2,
label="-8.0 kcal/mol threshold")
ax1.axvline(df["best_affinity"].mean(), color="#0d6e54",
linestyle=":", linewidth=1.2,
label=f"Mean: {df['best_affinity'].mean():.2f} kcal/mol")
ax1.set_xlabel("Binding affinity (kcal/mol)")
ax1.set_ylabel("Number of compounds")
ax1.set_title("(A) Score distribution — full library",
fontweight="bold", loc="left")
ax1.legend(fontsize=9)
# Panel B: Cumulative fraction below each score
scores_sorted = np.sort(df["best_affinity"])
cumfrac = np.arange(1, len(scores_sorted) + 1) / len(scores_sorted)
ax2.plot(scores_sorted, cumfrac * 100, color="#185fa5", linewidth=1.5)
ax2.axvline(-8.0, color="#ba7517", linestyle="--", linewidth=1.0)
pct_below = (df["best_affinity"] <= -8.0).mean() * 100
ax2.axhline(pct_below, color="#ba7517", linestyle=":", linewidth=0.8,
label=f"{pct_below:.1f}% hit rate at -8 kcal/mol")
ax2.set_xlabel("Binding affinity (kcal/mol)")
ax2.set_ylabel("Cumulative % of library")
ax2.set_title("(B) Cumulative hit rate", fontweight="bold", loc="left")
ax2.legend(fontsize=9)
plt.tight_layout()
plt.savefig("score_distribution.png", dpi=300, bbox_inches="tight")
plt.close()
print("Saved score_distribution.png")
Complete pipeline script
#!/usr/bin/env python3
import glob, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
OUTPUT_DIR = "./docking_output" # directory of *_out.pdbqt files
RESULTS_DIR = "./screening_results"
HIT_CUTOFF = -8.0 # kcal/mol
os.makedirs(RESULTS_DIR, exist_ok=True)
def parse_vina_pdbqt(filepath):
modes = []
with open(filepath) as f:
for line in f:
if line.startswith("REMARK VINA RESULT:"):
p = line.split()
modes.append({"affinity": float(p[3]),
"rmsd_lb": float(p[4]),
"rmsd_ub": float(p[5])})
return modes
rows = []
for fp in sorted(glob.glob(os.path.join(OUTPUT_DIR, "*_out.pdbqt"))):
name = os.path.basename(fp).replace("_out.pdbqt", "")
modes = parse_vina_pdbqt(fp)
if not modes: continue
rows.append({
"compound": name,
"best_affinity": modes[0]["affinity"],
"n_modes": len(modes),
"affinity_2": modes[1]["affinity"] if len(modes) > 1 else None,
"score_gap": (modes[1]["affinity"] - modes[0]["affinity"])
if len(modes) > 1 else None,
"filepath": fp,
})
df = pd.DataFrame(rows).sort_values("best_affinity").reset_index(drop=True)
df["rank"] = df.index + 1
hits = df[df["best_affinity"] <= HIT_CUTOFF]
# Export
df.to_csv(os.path.join(RESULTS_DIR, "all_ranked.csv"), index=False)
hits.to_csv(os.path.join(RESULTS_DIR, "hits.csv"), index=False)
# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.hist(df["best_affinity"], bins=40, color="#185fa5", edgecolor="white")
ax1.axvline(HIT_CUTOFF, color="#ba7517", ls="--",
label=f"{HIT_CUTOFF} kcal/mol ({len(hits)} hits)")
ax1.set(xlabel="Binding affinity (kcal/mol)", ylabel="Count",
title="Score distribution")
ax1.legend(fontsize=9)
ax2.scatter(df.index, df["best_affinity"], s=4, alpha=0.4, color="#185fa5")
ax2.axhline(HIT_CUTOFF, color="#ba7517", ls="--")
ax2.set(xlabel="Rank", ylabel="Binding affinity (kcal/mol)",
title="Score by rank")
plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, "score_plots.png"), dpi=300)
plt.close()
print(f"Library: {len(df)} compounds")
print(f"Hits (≤{HIT_CUTOFF}): {len(hits)} ({len(hits)/len(df)*100:.1f}%)")
print(f"Best: {df.iloc[0]['compound']} at {df.iloc[0]['best_affinity']:.2f} kcal/mol")
print(f"Saved results to {RESULTS_DIR}/")
Vina result parsing in one paragraph
Each Vina output PDBQT contains REMARK VINA RESULT: lines — one per binding mode — with the affinity in kcal/mol as the fourth space-separated token. Read every *_out.pdbqt file with glob, extract the first (best) affinity from each, and collect into a list of dictionaries. Load into pandas, sort ascending by best_affinity, add a rank column, and filter with df[df["best_affinity"] <= -8.0] for hits. Export to CSV with to_csv(). Plot the score histogram with a threshold line to visualize the hit rate. Scores are estimates — always inspect top hits in PyMOL before committing to experimental follow-up. The whole pipeline from directory of PDBQT files to ranked CSV and plot takes less than 30 seconds for a thousand-compound screen.