
Prompt injection attacks trick AI models into ignoring their instructions — users paste in jailbreaks like “DAN” prompts, role-play escapes, or instruction overrides designed to bypass your system prompt and extract restricted information or cause your AI to behave in unintended ways.

Arcjet prompt injection detection runs inside your application and scores each incoming message for injection patterns before it reaches the AI provider. Detected attacks are blocked before the AI call is made, protecting both your application's behavior and your AI budget.
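The flow described above can be sketched as a gate in front of the AI call. Everything here is illustrative: `scoreMessage` is a toy stand-in for the detection engine, not Arcjet's implementation, and `gate` is a hypothetical helper.

```typescript
// Minimal sketch of the request flow: score first, and only call the
// AI provider if the message is not blocked. All names are hypothetical.
type Decision = { blocked: boolean; score: number };

// Toy scorer: flags a couple of obvious instruction-override phrases.
// The real engine scores many more patterns than this.
function scoreMessage(message: string): number {
  const patterns = [
    /ignore (all )?previous instructions/i,
    /you are now dan/i,
  ];
  return patterns.some((p) => p.test(message)) ? 0.9 : 0.1;
}

function gate(message: string, threshold: number): Decision {
  const score = scoreMessage(message);
  // Blocked requests never reach the AI provider, so no tokens are spent.
  return { blocked: score >= threshold, score };
}
```

A blocked decision here would translate to returning an error to the client instead of forwarding the message to the model.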

threshold: 0.5 - the minimum confidence score (between 0 and 1, exclusive) required to block a request. Lower values are more aggressive and catch more attacks but may produce false positives. The default of 0.5 is a balanced starting point; raise it (e.g. 0.8) to reduce false positives, or lower it (e.g. 0.3) for stricter enforcement.

detectPromptInjectionMessage - the text to score. Pass the user’s most recent message, or the full conversation history if you want to scan all messages.

mode: "DRY_RUN" - logs detections without blocking. Use this to measure the false-positive rate in production before switching to "LIVE".

Prompt injection detection controls what reaches your AI model. To also block automated clients and enforce per-user budgets, combine it with AI abuse protection and AI budget control.