Name: VENZX
Price: 0.01 USD

import re, random, subprocess, anthropic

# Shared Anthropic API client; propose_child() sends every model request through it.
# NOTE(review): constructed with no arguments — presumably the SDK reads its
# credentials from the environment; confirm before running.
client = anthropic.Anthropic()

# Preferred shape: opening fence at the start of a line, closing fence alone on
# its own line. Backticks that appear *inside* the generated code (e.g. in a
# prompt string) therefore do not end the block early.
_FENCE_ON_OWN_LINE = re.compile(
    r"^```(?:python)?[ \t]*\r?\n(.*?)^```[ \t]*\r?$",
    re.DOTALL | re.MULTILINE,
)
# Fallback: the original permissive pattern, for replies whose fences are inline.
_FENCE_ANYWHERE = re.compile(r"```(?:python)?\s*(.*?)```", re.DOTALL)


def _extract_code_block(text: str) -> str | None:
    """Return the stripped body of the first fenced block in *text*, or None."""
    found = _FENCE_ON_OWN_LINE.search(text) or _FENCE_ANYWHERE.search(text)
    if found is None:
        return None
    code = found.group(1).strip()
    # An empty block is as unusable as a missing one.
    return code or None


def propose_child(parent_code: str, parent_score: float) -> str | None:
    """Ask the model for one revised version of the agent's source code.

    The parent's source and benchmark score are embedded in a single user
    message. The reply is expected to carry the complete revised file inside
    one fenced code block.

    Args:
        parent_code: Full source text of the parent agent.
        parent_score: Parent's benchmark score; shown to the model with three
            decimal places.

    Returns:
        The extracted source with surrounding whitespace stripped, or ``None``
        when the reply contains no fenced block or the block is empty, so the
        caller can skip the malformed generation.
    """
    resp = client.messages.create(
        model="claude-3-5-sonnet-20241022",  # frozen foundation model
        max_tokens=8192,
        messages=[{
            "role": "user",
            "content": (
                f"You are a self-improving coding agent.\n\n{parent_code}\n\n"
                f"Current benchmark score: {parent_score:.3f}\n\n"
                "Make ONE targeted improvement to your tool use, error handling, "
                "or context management. Return ONLY the complete revised Python "
                "file in a single ```python code block. No prose."
            ),
        }],
    )
    # Only text blocks carry the answer; other content block types are ignored.
    text = "".join(b.text for b in resp.content if b.type == "text")
    return _extract_code_block(text)

def run_benchmark(code: str) -> float:
    """Write a candidate agent to disk and score it with ``evaluate.py``.

    The candidate source is written to ``/sandbox/agent_candidate.py`` and
    ``evaluate.py`` is run as a child process with a 30-minute timeout. The
    score is read from the last line of the evaluator's standard output.

    Args:
        code: Source text of the candidate agent.

    Returns:
        The parsed score, or ``0.0`` when the evaluator times out, prints
        nothing, or prints a last line that is not a finite number.
    """
    import math  # local import: the file's import line does not bring in math

    # Run untrusted, self-generated code in an isolated sandbox — never the host.
    # NOTE(review): nothing in this function enforces that isolation; it only
    # writes under /sandbox and launches evaluate.py. Confirm the evaluator or
    # the surrounding environment actually provides the sandbox.
    # Explicit UTF-8 so generated code containing non-ASCII text cannot fail on
    # a platform whose default encoding is narrower.
    with open("/sandbox/agent_candidate.py", "w", encoding="utf-8") as f:
        f.write(code)
    try:
        result = subprocess.run(
            ["python", "evaluate.py", "--agent", "/sandbox/agent_candidate.py"],
            capture_output=True, text=True, timeout=1800,
        )
    except subprocess.TimeoutExpired:
        # A hung candidate is a failed candidate, not a reason to abort the run.
        return 0.0
    try:
        score = float(result.stdout.strip().splitlines()[-1])
    except (ValueError, IndexError):
        return 0.0
    # float() accepts "nan"/"inf"; such a score would corrupt the selection
    # weights and every later comparison, so treat it as a failure as well.
    return score if math.isfinite(score) else 0.0

def select_parent(archive: list[dict]) -> dict:
    # Favor high scores AND under-explored agents — so weak ancestors survive.
    weights = [a["score"] + 1.0 / (1 + a["children"]) for a in archive]
    return random.choices(archive, weights=weights, k=1)[0]

# Load the starting agent. The context manager closes the file promptly, and
# the explicit encoding keeps the read independent of the platform default.
with open("agent_v0.py", encoding="utf-8") as seed_file:
    seed = seed_file.read()

# The archive starts with the seed agent and its measured score.
archive = [{"code": seed, "score": run_benchmark(seed), "children": 0}]

for gen in range(80):
    parent = select_parent(archive)
    # Counted before the proposal is checked, so failed generations also
    # reduce this parent's exploration bonus.
    parent["children"] += 1
    child = propose_child(parent["code"], parent["score"])
    if child is None:
        continue
    score = run_benchmark(child)
    # A child joins the archive when it scores at least as well as its own
    # parent; it is compared with that parent only, not with the archive's best.
    if score >= parent["score"]:                       # keep improvements
        archive.append({"code": child, "score": score, "children": 0})
        print(f"gen {gen}: {score:.1%} — kept")
    else:
        print(f"gen {gen}: {score:.1%} — discarded")

# Report the highest-scoring agent found and the final archive size.
best = max(archive, key=lambda a: a["score"])
print(f"best: {best['score']:.1%} across {len(archive)} agents")

Conclusion: the gain came from evolving the code around a frozen model, not from a bigger model — and from open-ended search over an archive rather than greedy hill-climbing. The code above is a teaching skeleton, not a reproduction: the real DGM is a full coding agent that operates on a repository and is scored against the SWE-bench harness, run sandboxed and under human oversight.

Your feedback matters: letstalk@venzx.com

The Darwin Gödel Machine loop — archive, not hill-climbing

You might also like...

Unleashing the Full Potential of Deep Learning Models: A Guide to Quantization Techniques

The Birth of Criminal ChatGPTs That Can Outthink Us All

A vaccine has been launched for beginners in the AI domain