{
  "$schema": "https://humangarden.ai/.well-known/skill-spec.schema.json",
  "canonical_url": "https://humangarden.ai/skills/pdf/spec.json",
  "slug": "pdf",
  "last_tested": "2026-06-05",
  "tested_by": "humangarden",
  "verdict_one_line": "pypdf text extraction works clean on born-digital PDFs; pdfplumber tables silently return false positives on academic layouts — do not trust the fallback path",
  "fires_when": [
    "user has a .pdf file and wants its text content",
    "user wants to convert PDF prose to markdown or plain text",
    "user mentions extracting from a born-digital PDF (report, contract, invoice, paper)",
    "user wants PDF metadata (author, dates, producer, creation tool)",
    "user wants to merge, split, rotate, or watermark PDF pages"
  ],
  "skip_when": [
    "user wants to extract structured data from academic papers with figure-rendered tables",
    "user has scanned PDFs and needs OCR (this skill mentions tesseract but does not bundle it)",
    "user wants pixel-accurate layout preservation",
    "user wants to fill PDF form fields (different code path — see scripts/ folder, not the inline snippets)"
  ],
  "inputs": [
    {
      "type": "file",
      "format": "pdf",
      "constraint": "born-digital strongly preferred; scanned PDFs require additional OCR setup"
    },
    {
      "type": "file",
      "format": "pdf",
      "constraint": "for form-filling, use the scripts/ folder, not the SKILL.md snippets"
    }
  ],
  "outputs": [
    {
      "type": "text",
      "format": "plain text or markdown",
      "quality_note": "prose ≈95% accurate on born-digital; pypdf occasionally injects spaces inside acronyms (e.g. \"SWA\" → \"SW A\")"
    },
    {
      "type": "structured-data",
      "format": "list-of-rows",
      "quality_note": "0% recall on figure-rendered tables in academic papers; false-positive prone if text-strategy fallback is used"
    }
  ],
  "installation": {
    "pip": [
      "pypdf",
      "pdfplumber"
    ],
    "npm": [],
    "system": {
      "macos": "optional brew install poppler (for pdftotext CLI — skill does not flag as required)"
    },
    "notes": "No requirements.txt ships with the skill; the inline scripts in SKILL.md are copy-paste, not importable."
  },
  "artifacts": [
    {
      "kind": "image",
      "file": "https://humangarden.ai/spec-artifacts/pdf/arxiv-2310.06825-page1.png",
      "caption": "Page 1 of the Mistral 7B paper (arXiv 2310.06825, 9 pages, 3.7 MB)",
      "role": "input",
      "hero": true
    },
    {
      "kind": "text",
      "inline": "Pages: 9\nTotal chars: 24815\n---FIRST 1500 CHARS---\nMistral 7B\nAlbert Q. Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford,\nDevendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel,\nGuillaume Lample, Lucile Saulnier, Lélio Renard Lavaud, Marie-Anne Lachaux,\n...\nWe introduce Mistral 7B, a 7-billion-parameter language model engineered for\nsuperior performance and efficiency. Mistral 7B outperforms the best open 13B\nmodel (Llama 2) across all evaluated benchmarks...\n",
      "caption": "pypdf extracted 24,815 chars in ~2 seconds. Prose clean; one artifact: \"SW A\" instead of \"SWA\".",
      "role": "output",
      "hero": true
    },
    {
      "kind": "pdf",
      "file": "https://humangarden.ai/spec-artifacts/pdf/arxiv-2310.06825.pdf",
      "caption": "The full input PDF",
      "role": "input",
      "hero": false
    }
  ],
  "caveats": [
    "pdfplumber default extract_tables() returns 0 on academic papers (silent failure)",
    "pdfplumber text-strategy fallback returns confident-looking garbage that will poison downstream pipelines",
    "pypdf inserts spurious whitespace inside acronyms in some PDFs",
    "the eight scripts in /scripts are all form-handling; not relevant to text extraction"
  ],
  "needs_credentials": [],
  "source_repo": "https://github.com/anthropics/skills/tree/main/pdf",
  "human_review_url": "https://humangarden.ai/skills/pdf/"
}