import re, random, subprocess, anthropic
# Module-level Anthropic API client; propose_child() sends its requests
# through this object. It is constructed with no arguments — presumably the
# SDK picks up credentials from the environment; confirm for your deployment.
client = anthropic.Anthropic()
def propose_child(parent_code: str, parent_score: float) -> str | None:
    """Ask the model for one revised version of an agent's source code.

    The parent's source and benchmark score are sent in a single user
    message through the module-level ``client``; the reply is expected to
    hold the whole revised file inside one fenced code block.

    Args:
        parent_code: Full source text of the parent agent.
        parent_score: The parent's benchmark score (shown to the model with
            three decimal places).

    Returns:
        The stripped contents of the first fenced block in the reply, or
        ``None`` when the reply contains no complete, non-empty fenced
        block (callers treat ``None`` as "skip this generation").
    """
    resp = client.messages.create(
        model="claude-3-5-sonnet-20241022",  # frozen foundation model
        max_tokens=8192,
        messages=[{
            "role": "user",
            "content": (
                f"You are a self-improving coding agent.\n\n{parent_code}\n\n"
                f"Current benchmark score: {parent_score:.3f}\n\n"
                "Make ONE targeted improvement to your tool use, error handling, "
                "or context management. Return ONLY the complete revised Python "
                "file in a single ```python code block. No prose."
            ),
        }],
    )
    text = "".join(b.text for b in resp.content if b.type == "text")

    # Both fences must start a line, and the closing fence must be alone on
    # its line. An unanchored pattern would stop at the first triple backtick
    # anywhere in the code -- e.g. inside a prompt string like the one above --
    # and silently hand back a truncated file. Any language tag is accepted
    # so a tag other than "python" does not leak into the extracted code.
    # A bare fence line *inside* the returned code would still end the block
    # early; a reply cut off before its closing fence yields no match.
    m = re.search(
        r"^```[\w+-]*[ \t]*\r?\n(.*?)^```[ \t]*\r?$",
        text,
        re.DOTALL | re.MULTILINE,
    )
    if m is None:
        return None  # skip malformed generations
    code = m.group(1).strip()
    # An empty block is not a usable agent; treat it as malformed too.
    return code or None
def run_benchmark(
    code: str,
    *,
    agent_path: str = "/sandbox/agent_candidate.py",
    harness: str = "evaluate.py",
    interpreter: str = "python",
    timeout: float = 1800,
) -> float:
    """Write candidate agent code to disk and score it with the harness.

    The harness is run as ``<interpreter> <harness> --agent <agent_path>``
    and the last line of its stdout is parsed as the score.

    Args:
        code: Source text of the candidate agent.
        agent_path: Where the candidate is written before evaluation.
        harness: Path of the evaluation script.
        interpreter: Executable used to run the harness.
        timeout: Seconds to wait for the harness before giving up.

    Returns:
        The score printed on the last stdout line, or ``0.0`` when the
        harness times out, prints nothing, or prints something that is not
        a finite number.

    Raises:
        OSError: If ``agent_path`` cannot be written or ``interpreter``
            cannot be launched (configuration errors are not swallowed).
    """
    import math  # function-scope so the file's import block stays untouched

    # NOTE(review): the code being run is untrusted and self-generated, but
    # nothing in this function enforces isolation -- it only writes to
    # `agent_path` and launches the harness as an ordinary subprocess. Any
    # sandboxing has to come from the environment this script runs in
    # (container / VM); confirm that before running it on a real host.
    #
    # Explicit UTF-8: model output may contain non-ASCII characters, and the
    # locale's default encoding is not guaranteed to handle them.
    with open(agent_path, "w", encoding="utf-8") as f:
        f.write(code)

    try:
        result = subprocess.run(
            [interpreter, harness, "--agent", agent_path],
            capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # A hung candidate is a failed candidate, not a reason to abort the
        # whole evolutionary run.
        return 0.0

    try:
        score = float(result.stdout.strip().splitlines()[-1])
    except (ValueError, IndexError):
        return 0.0
    # float() happily parses "nan" and "inf"; either would poison score
    # comparisons and selection weights downstream.
    return score if math.isfinite(score) else 0.0
def select_parent(archive: list[dict]) -> dict:
    """Draw one archive entry at random, weighted by score plus a novelty bonus.

    An entry's weight is its ``"score"`` plus ``1 / (1 + children)``: the
    bonus shrinks as an entry accumulates children, so rarely-expanded
    entries keep a real chance of being picked even when they score poorly.
    """
    def _weight(entry: dict) -> float:
        exploration_bonus = 1.0 / (1 + entry["children"])
        return entry["score"] + exploration_bonus

    (chosen,) = random.choices(archive, weights=list(map(_weight, archive)), k=1)
    return chosen
# Driver: score the seed agent, then run 80 generations of
# select -> propose -> benchmark, growing the archive as it goes.

# Read the seed inside a context manager so the handle is closed promptly,
# and as UTF-8 (the default encoding for Python source files) rather than
# whatever the locale happens to be.
with open("agent_v0.py", encoding="utf-8") as seed_file:
    seed = seed_file.read()
archive = [{"code": seed, "score": run_benchmark(seed), "children": 0}]

for gen in range(80):
    parent = select_parent(archive)
    # Counted before the proposal is validated, so failed attempts also
    # shrink this parent's exploration bonus.
    parent["children"] += 1
    child = propose_child(parent["code"], parent["score"])
    if child is None:
        # Log the skip so the per-generation output has no silent gaps.
        print(f"gen {gen}: no usable code block — skipped")
        continue
    score = run_benchmark(child)
    if score >= parent["score"]:  # keep non-regressions (ties included)
        archive.append({"code": child, "score": score, "children": 0})
        print(f"gen {gen}: {score:.1%} — kept")
    else:
        print(f"gen {gen}: {score:.1%} — discarded")

best = max(archive, key=lambda a: a["score"])
print(f"best: {best['score']:.1%} across {len(archive)} agents")
Conclusion: the gain came from evolving the code around a frozen model, not from a bigger model, and from open-ended search over an archive rather than greedy hill-climbing. The code above is a teaching skeleton, not a reproduction: the real Darwin Gödel Machine (DGM) is a full coding agent that operates on a repository against the SWE-bench harness and runs sandboxed with human oversight.
Your feedback matters: letstalk@venzx.com


