How to Automate This — AI Output Simulator

Start by navigating to the deployed URL as the first step in your test — for example, using a goto (or GotoAsync) step in Playwright.

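The core challenge: AI responses are non-deterministic — the same prompt can produce a different (but often equally valid) answer on every run. Exact string matching will cause your tests to fail even when the AI is working correctly. Instead, use a semantic or fuzzy comparison library to assert that the response is close enough to an expected value. Note that the libraries used in the examples below score lexical overlap (shared characters and tokens), not meaning; if you need true semantic comparison, compare embeddings or use a model-based evaluator instead.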

The examples below show how to navigate to this demo, click the button to get a response, and assert that the output is meaningfully similar to a reference answer — rather than requiring an exact match.

Playwright examples

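Install (TypeScript): npm install string-similarity @types/string-similarity (the example also imports @playwright/test, which must already be installed in the project)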

// automate-ai-demo.spec.ts
import { test, expect } from '@playwright/test';
import stringSimilarity from 'string-similarity';

// Runs the pre-filled prompt on the demo page and asserts that the first
// response is lexically close to a reference answer instead of identical to it.
test('AI response is semantically similar to expected output', async ({ page }) => {
  // 1. Navigate to the demo
  await page.goto('https://buttered-spuds.github.io/ai-non-determinism-demo/');

  // 2. The prompt is pre-filled; click Run Prompt to submit it
  await page.getByRole('button', { name: 'Run Prompt', exact: true }).click();

  // 3. Wait for the response card, then for the response text itself.
  //    The retrying not.toBeEmpty() assertion avoids reading the blockquote
  //    before any text has been rendered into it.
  const responseCard = page.getByRole('article').first();
  await expect(responseCard).toBeVisible();
  const responseQuote = responseCard.getByRole('blockquote', { name: 'AI response' });
  await expect(responseQuote).not.toBeEmpty();

  // Trim before stripping quotes: the regex is anchored to the very start/end
  // of the string, so surrounding whitespace would leave the quotes in place.
  const rawText = (await responseQuote.textContent()) ?? '';
  const responseText = rawText.trim().replace(/^"|"$/g, '');

  // 4. Use fuzzy/semantic comparison instead of an exact match
  const expected = 'Automated testing saves time by catching bugs early and reducing manual effort.';
  const similarity = stringSimilarity.compareTwoStrings(responseText, expected);

  // compareTwoStrings returns 0–1; assert the response is meaningfully similar
  expect(similarity, `Response was not similar enough. Got: "${responseText}"`).toBeGreaterThan(0.5);
});

Install (C# / .NET): run `dotnet add package Microsoft.Playwright.NUnit`, then `dotnet add package FuzzySharp`

// AutomateAiDemoTests.cs
using System.Threading.Tasks;
using FuzzySharp;
using Microsoft.Playwright; // AriaRole lives in this namespace
using Microsoft.Playwright.NUnit;
using NUnit.Framework;

/// <summary>
/// Runs the pre-filled prompt on the demo page and asserts that the first
/// response is lexically close to a reference answer instead of identical to it.
/// </summary>
[Parallelizable(ParallelScope.Self)]
[TestFixture]
public class AutomateAiDemoTests : PageTest
{
    [Test]
    public async Task AiResponse_IsSemanticallyClose_ToExpected()
    {
        // 1. Navigate to the demo
        await Page.GotoAsync("https://buttered-spuds.github.io/ai-non-determinism-demo/");

        // 2. The prompt is pre-filled; click Run Prompt to submit it
        await Page.GetByRole(AriaRole.Button, new() { Name = "Run Prompt", Exact = true }).ClickAsync();

        // 3. Wait for the response card, then for the response text itself.
        //    The retrying Not.ToBeEmptyAsync() assertion avoids reading the
        //    blockquote before any text has been rendered into it.
        var responseCard = Page.GetByRole(AriaRole.Article).First;
        await Expect(responseCard).ToBeVisibleAsync();
        var responseQuote = responseCard.GetByRole(AriaRole.Blockquote, new() { Name = "AI response" });
        await Expect(responseQuote).Not.ToBeEmptyAsync();

        // Trim whitespace first so that the surrounding quotes really are the
        // outermost characters when Trim('"') runs.
        var rawText = await responseQuote.TextContentAsync() ?? "";
        var responseText = rawText.Trim().Trim('"');

        // 4. Use FuzzySharp for fuzzy comparison instead of an exact match
        const string expected = "Automated testing saves time by catching bugs early and reducing manual effort.";
        int score = Fuzz.TokenSortRatio(responseText, expected);

        // Fuzz.TokenSortRatio returns 0–100; assert the response is meaningfully similar
        Assert.That(score, Is.GreaterThan(50),
            $"Similarity score {score} is below threshold. Got: \"{responseText}\"");
    }
}

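Install (Python): run `pip install pytest-playwright rapidfuzz`, then `playwright install` (pytest-playwright supplies the `page` fixture used below and pulls in Playwright itself)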

# test_automate_ai_demo.py
import re
from playwright.sync_api import Page, expect
from rapidfuzz import fuzz

def test_ai_response_is_semantically_close(page: Page):
    """Run the pre-filled prompt and check the reply is close to a reference answer.

    The comparison is fuzzy (token_sort_ratio, 0–100) rather than exact,
    because the response text differs from run to run.
    """
    # 1. Navigate to the demo
    page.goto("https://buttered-spuds.github.io/ai-non-determinism-demo/")

    # 2. The prompt is pre-filled; click Run Prompt to submit it
    page.get_by_role("button", name="Run Prompt", exact=True).click()

    # 3. Wait for the response card, then for the response text itself.
    #    The retrying not_to_be_empty() assertion avoids reading the blockquote
    #    before any text has been rendered into it.
    response_card = page.get_by_role("article").first
    expect(response_card).to_be_visible()
    response_quote = response_card.get_by_role("blockquote", name="AI response", exact=True)
    expect(response_quote).not_to_be_empty()

    # Strip whitespace before removing quotes: the regex is anchored to the very
    # start/end of the string, so surrounding whitespace would leave the quotes in place.
    raw_text = (response_quote.text_content() or "").strip()
    response_text = re.sub(r'^"|"$', "", raw_text)

    # 4. Use rapidfuzz for fuzzy comparison instead of an exact match
    expected = "Automated testing saves time by catching bugs early and reducing manual effort."
    score = fuzz.token_sort_ratio(response_text, expected)

    # token_sort_ratio returns 0–100; assert the response is meaningfully similar
    assert score > 50, f"Similarity score {score} is below threshold. Got: {response_text!r}"

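Install (JavaScript): npm install string-similarity (the example also requires @playwright/test, which must already be installed in the project)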

// automate-ai-demo.spec.js
const { test, expect } = require('@playwright/test');
const stringSimilarity = require('string-similarity');

// Runs the pre-filled prompt on the demo page and asserts that the first
// response is lexically close to a reference answer instead of identical to it.
test('AI response is semantically similar to expected output', async ({ page }) => {
  // 1. Navigate to the demo
  await page.goto('https://buttered-spuds.github.io/ai-non-determinism-demo/');

  // 2. The prompt is pre-filled; click Run Prompt to submit it
  await page.getByRole('button', { name: 'Run Prompt', exact: true }).click();

  // 3. Wait for the response card, then for the response text itself.
  //    The retrying not.toBeEmpty() assertion avoids reading the blockquote
  //    before any text has been rendered into it.
  const responseCard = page.getByRole('article').first();
  await expect(responseCard).toBeVisible();
  const responseQuote = responseCard.getByRole('blockquote', { name: 'AI response' });
  await expect(responseQuote).not.toBeEmpty();

  // Trim before stripping quotes: the regex is anchored to the very start/end
  // of the string, so surrounding whitespace would leave the quotes in place.
  const rawText = (await responseQuote.textContent()) ?? '';
  const responseText = rawText.trim().replace(/^"|"$/g, '');

  // 4. Use fuzzy/semantic comparison instead of an exact match
  const expected = 'Automated testing saves time by catching bugs early and reducing manual effort.';
  const similarity = stringSimilarity.compareTwoStrings(responseText, expected);

  // compareTwoStrings returns 0–1; assert the response is meaningfully similar
  expect(similarity, `Response was not similar enough. Got: "${responseText}"`).toBeGreaterThan(0.5);
});

Tips for testing AI-powered UIs

▸ Never use exact-text assertions on AI-generated content. AI outputs vary by design — use fuzzy or semantic comparison to check that the response is close enough, not identical.
▸ Assert on structure, not content. Check that a response card appeared, that it is non-empty, and that it does not contain error states.
▸ Use statistical thresholds. Run the prompt multiple times and assert that the pass rate meets your quality bar (e.g. ≥ 80%).
▸ Separate deterministic from non-deterministic checks. UI structure (buttons, layout, labels) can be asserted exactly; AI content cannot.