github
diff --git a/‎containers/api-proxy/token-tracker.js‎
Lines changed: 68 additions & 0 deletions b/‎containers/api-proxy/token-tracker.js‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎containers/api-proxy/token-tracker.test.js‎
Lines changed: 208 additions & 0 deletions b/‎containers/api-proxy/token-tracker.test.js‎
Lines changed: 208 additions & 0 deletions
diff --git a/‎samples/audit/audit.jsonl‎
Lines changed: 3 additions & 3 deletions b/‎samples/audit/audit.jsonl‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎schemas/README.md‎
Lines changed: 40 additions & 0 deletions b/‎schemas/README.md‎
Lines changed: 40 additions & 0 deletions
@@ -84,11 +84,76 @@ function getLogStream() {
   }
 }
 
+/**
+ * Validate a token usage record against the token-usage/v1 schema contract.
+ *
+ * Checks that all required fields are present and have the expected types.
+ * Logs a warning and returns false if the record is non-conformant; does
+ * not throw, so a bad record is dropped rather than crashing the proxy.
+ *
+ * @param {object} record - The record to validate
+ * @returns {boolean} true if the record is valid, false otherwise
+ */
+function validateTokenUsageRecord(record) {
+  if (!record || typeof record !== 'object') {
+    logRequest('warn', 'token_record_schema_violation', {
+      field: 'record',
+      expected: 'object',
+      actual: record === null ? 'null' : typeof record,
+    });
+    return false;
+  }
+
+  const required = [
+    ['_schema', 'string'],
+    ['timestamp', 'string'],
+    ['request_id', 'string'],
+    ['provider', 'string'],
+    ['model', 'string'],
+    ['path', 'string'],
+    ['status', 'number'],
+    ['streaming', 'boolean'],
+    ['input_tokens', 'number'],
+    ['output_tokens', 'number'],
+    ['cache_read_tokens', 'number'],
+    ['cache_write_tokens', 'number'],
+    ['duration_ms', 'number'],
+  ];
+
+  for (const [field, expectedType] of required) {
+    // eslint-disable-next-line valid-typeof
+    if (typeof record[field] !== expectedType) {
+      logRequest('warn', 'token_record_schema_violation', {
+        request_id: record.request_id,
+        field,
+        expected: expectedType,
+        actual: typeof record[field],
+      });
+      return false;
+    }
+  }
+
+  if (record._schema !== 'token-usage/v1') {
+    logRequest('warn', 'token_record_schema_violation', {
+      request_id: record.request_id,
+      field: '_schema',
+      expected: 'token-usage/v1',
+      actual: record._schema,
+    });
+    return false;
+  }
+
+  return true;
+}
+
 /**
  * Write a token usage record to the JSONL log file.
+ * Validates the record against the token-usage/v1 schema before writing.
  * Handles backpressure by dropping writes when the stream buffer is full.
  */
 function writeTokenUsage(record) {
+  if (!validateTokenUsageRecord(record)) return;
+
   const stream = getLogStream();
   if (stream && !stream.writableEnded) {
     const ok = stream.write(JSON.stringify(record) + '\n');
@@ -478,6 +543,7 @@ function trackTokenUsage(proxyRes, opts) {
 
     // Build log record
     const record = {
+      _schema: 'token-usage/v1',
       timestamp: new Date().toISOString(),
       request_id: requestId,
       provider,
@@ -693,6 +759,7 @@ function trackWebSocketTokenUsage(upstreamSocket, opts) {
     }
 
     const record = {
+      _schema: 'token-usage/v1',
       timestamp: new Date().toISOString(),
       request_id: requestId,
       provider,
@@ -759,6 +826,7 @@ module.exports = {
   normalizeUsage,
   isStreamingResponse,
   isCompressedResponse,
+  validateTokenUsageRecord,
   writeTokenUsage,
   TOKEN_LOG_FILE,
 };
@@ -12,6 +12,9 @@ const {
   isCompressedResponse,
   trackTokenUsage,
   trackWebSocketTokenUsage,
+  validateTokenUsageRecord,
+  writeTokenUsage,
+  closeLogStream,
 } = require('./token-tracker');
 const { EventEmitter } = require('events');
 const os = require('os');
@@ -1044,3 +1047,208 @@ describe('trackWebSocketTokenUsage', () => {
     }, 10);
   });
 });
+
+// ── validateTokenUsageRecord ─────────────────────────────────────────
+
+describe('validateTokenUsageRecord', () => {
+  const validRecord = { // baseline record with every required field; tests below override or omit one field at a time
+    _schema: 'token-usage/v1',
+    timestamp: '2025-01-01T00:00:00.000Z',
+    request_id: 'req-123',
+    provider: 'anthropic',
+    model: 'claude-sonnet-4-20250514',
+    path: '/v1/messages',
+    status: 200,
+    streaming: false,
+    input_tokens: 100,
+    output_tokens: 50,
+    cache_read_tokens: 0,
+    cache_write_tokens: 0,
+    duration_ms: 1234,
+  };
+
+  test('accepts a valid record', () => {
+    expect(validateTokenUsageRecord(validRecord)).toBe(true);
+  });
+
+  test('accepts a record with optional response_bytes', () => {
+    expect(validateTokenUsageRecord({ ...validRecord, response_bytes: 512 })).toBe(true);
+  });
+
+  test('rejects a record with wrong _schema', () => {
+    expect(validateTokenUsageRecord({ ...validRecord, _schema: 'wrong/v99' })).toBe(false);
+  });
+
+  test('rejects a record missing _schema', () => {
+    const { _schema, ...noSchema } = validRecord; // rest destructuring yields a copy without _schema
+    expect(validateTokenUsageRecord(noSchema)).toBe(false);
+  });
+
+  test('rejects a record with non-string timestamp', () => {
+    expect(validateTokenUsageRecord({ ...validRecord, timestamp: 1234567890 })).toBe(false);
+  });
+
+  test('rejects a record with non-number input_tokens', () => {
+    expect(validateTokenUsageRecord({ ...validRecord, input_tokens: '100' })).toBe(false); // numeric string, not a number
+  });
+
+  test('rejects a record with non-boolean streaming', () => {
+    expect(validateTokenUsageRecord({ ...validRecord, streaming: 'true' })).toBe(false); // string 'true', not a boolean
+  });
+
+  test('rejects a record missing a required field', () => {
+    const { model, ...noModel } = validRecord; // rest destructuring yields a copy without model
+    expect(validateTokenUsageRecord(noModel)).toBe(false);
+  });
+
+  test('rejects null without throwing', () => {
+    expect(validateTokenUsageRecord(null)).toBe(false);
+  });
+
+  test('rejects undefined without throwing', () => {
+    expect(validateTokenUsageRecord(undefined)).toBe(false);
+  });
+
+  test('rejects a non-object primitive without throwing', () => {
+    expect(validateTokenUsageRecord('not-an-object')).toBe(false);
+    expect(validateTokenUsageRecord(42)).toBe(false);
+  });
+});
+
+// ── JSONL records include _schema field ───────────────────────────────
+
+/**
+ * Build a minimal mock of a writable stream that captures every chunk
+ * passed to `write()`.
+ *
+ * The mock exposes `writableEnded`, `write`, `end` and `on`. `write` records
+ * the chunk and always returns true; `end` sets `writableEnded` to true and
+ * invokes its callback when one is given; `on` is a no-op jest mock.
+ * The `writtenRecords` getter trims each captured chunk, parses it as JSON,
+ * and returns the resulting array of records.
+ *
+ * @returns {object} the mock stream
+ */
+function makeMockStream() {
+  const chunks = [];
+  const stream = {
+    writableEnded: false,
+    write: jest.fn((chunk) => { chunks.push(chunk); return true; }),
+    end: jest.fn((cb) => { stream.writableEnded = true; if (cb) cb(); }),
+    on: jest.fn(),
+    get writtenRecords() {
+      return chunks.map(c => JSON.parse(c.trim()));
+    },
+  };
+  return stream;
+}
+
+describe('token-usage JSONL record schema field', () => {
+  let mockStream;
+  let mkdirSyncSpy;
+  let createWriteStreamSpy;
+
+  beforeEach(async () => {
+    // Close any open log stream so the next getLogStream() call creates a fresh one.
+    await closeLogStream();
+
+    mockStream = makeMockStream();
+
+    // Redirect fs.mkdirSync and fs.createWriteStream so the module writes to our
+    // in-memory stream rather than the unwritable /var/log/api-proxy path.
+    mkdirSyncSpy = jest.spyOn(fs, 'mkdirSync').mockReturnValue(undefined);
+    createWriteStreamSpy = jest.spyOn(fs, 'createWriteStream').mockReturnValue(mockStream);
+  });
+
+  afterEach(async () => {
+    mkdirSyncSpy.mockRestore();
+    createWriteStreamSpy.mockRestore();
+    await closeLogStream();
+  });
+
+  test('writeTokenUsage serializes _schema:"token-usage/v1" into the JSONL stream', () => {
+    const record = {
+      _schema: 'token-usage/v1',
+      timestamp: new Date().toISOString(),
+      request_id: 'direct-write-test',
+      provider: 'openai',
+      model: 'gpt-4o',
+      path: '/v1/chat/completions',
+      status: 200,
+      streaming: false,
+      input_tokens: 1,
+      output_tokens: 1,
+      cache_read_tokens: 0,
+      cache_write_tokens: 0,
+      duration_ms: 10,
+    };
+
+    writeTokenUsage(record);
+
+    expect(mockStream.write).toHaveBeenCalledTimes(1);
+    const parsed = mockStream.writtenRecords[0];
+    expect(parsed._schema).toBe('token-usage/v1');
+    expect(parsed.request_id).toBe('direct-write-test');
+  });
+
+  test('trackTokenUsage HTTP path writes _schema:"token-usage/v1" to the stream', (done) => {
+    const proxyRes = new EventEmitter(); // stand-in for the upstream response; only emits 'data' and 'end'
+    proxyRes.headers = { 'content-type': 'application/json' };
+    proxyRes.statusCode = 200;
+
+    trackTokenUsage(proxyRes, {
+      requestId: 'schema-field-http',
+      provider: 'openai',
+      path: '/v1/chat/completions',
+      startTime: Date.now(),
+      metrics: null,
+    });
+
+    proxyRes.emit('data', Buffer.from(JSON.stringify({
+      model: 'gpt-4o',
+      usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 },
+    })));
+    proxyRes.emit('end');
+
+    setTimeout(() => { // NOTE(review): presumably gives the 'end' handler time to write the record — confirm against trackTokenUsage
+      expect(mockStream.write).toHaveBeenCalledTimes(1);
+      const parsed = mockStream.writtenRecords[0];
+      expect(parsed._schema).toBe('token-usage/v1');
+      expect(parsed.request_id).toBe('schema-field-http');
+      done();
+    }, 20);
+  });
+
+  test('trackWebSocketTokenUsage path writes _schema:"token-usage/v1" to the stream', (done) => {
+    const socket = new EventEmitter(); // stand-in for the upstream socket; only emits 'data' and 'close'
+
+    function buildFrame(text) { // builds one unmasked WebSocket text frame: 2-byte header followed by the UTF-8 payload
+      const payload = Buffer.from(text, 'utf8');
+      const header = Buffer.alloc(2);
+      header[0] = 0x81; // FIN bit set + opcode 0x1 (text frame)
+      header[1] = payload.length; // 7-bit length with mask bit clear; only correct for payloads under 126 bytes
+      return Buffer.concat([header, payload]);
+    }
+
+    const httpHeader = Buffer.from('HTTP/1.1 101 Switching Protocols\r\nUpgrade: websocket\r\n\r\n'); // upgrade response sent ahead of the frames
+    const frame1 = buildFrame(JSON.stringify({
+      type: 'message_start',
+      message: { model: 'claude-sonnet-4-20250514', usage: { input_tokens: 20, output_tokens: 0 } },
+    }));
+    const frame2 = buildFrame(JSON.stringify({
+      type: 'message_delta',
+      usage: { output_tokens: 8 },
+    }));
+
+    trackWebSocketTokenUsage(socket, {
+      requestId: 'schema-field-ws',
+      provider: 'anthropic',
+      path: '/v1/messages',
+      startTime: Date.now(),
+      metrics: null,
+    });
+
+    socket.emit('data', Buffer.concat([httpHeader, frame1, frame2])); // upgrade response and both frames arrive in a single chunk
+    socket.emit('close');
+
+    setTimeout(() => { // NOTE(review): presumably gives the 'close' handler time to write the record — confirm against trackWebSocketTokenUsage
+      expect(mockStream.write).toHaveBeenCalledTimes(1);
+      const parsed = mockStream.writtenRecords[0];
+      expect(parsed._schema).toBe('token-usage/v1');
+      expect(parsed.request_id).toBe('schema-field-ws');
+      done();
+    }, 20);
+  });
+});
@@ -1,3 +1,3 @@
-{"ts":1774290908.910,"client":"172.30.0.20","host":"api.github.com:443","dest":"140.82.116.5:443","method":"CONNECT","status":200,"decision":"TCP_TUNNEL","url":"api.github.com:443"}
-{"ts":1774290909.180,"client":"172.30.0.20","host":"api.github.com:443","dest":"140.82.116.5:443","method":"CONNECT","status":200,"decision":"TCP_TUNNEL","url":"api.github.com:443"}
-{"ts":1774290909.186,"client":"172.30.0.20","host":"evil.example.com:443","dest":"-:-","method":"CONNECT","status":403,"decision":"TCP_DENIED","url":"evil.example.com:443"}
+{"_schema":"audit/v1","ts":1774290908.910,"client":"172.30.0.20","host":"api.github.com:443","dest":"140.82.116.5:443","method":"CONNECT","status":200,"decision":"TCP_TUNNEL","url":"api.github.com:443"}
+{"_schema":"audit/v1","ts":1774290909.180,"client":"172.30.0.20","host":"api.github.com:443","dest":"140.82.116.5:443","method":"CONNECT","status":200,"decision":"TCP_TUNNEL","url":"api.github.com:443"}
+{"_schema":"audit/v1","ts":1774290909.186,"client":"172.30.0.20","host":"evil.example.com:443","dest":"-:-","method":"CONNECT","status":403,"decision":"TCP_DENIED","url":"evil.example.com:443"}
@@ -0,0 +1,40 @@
+# AWF JSONL Schemas
+
+This directory contains versioned [JSON Schema](https://json-schema.org/) files for the JSONL artifact files emitted by AWF at runtime.
+
+## Files
+
+| Schema file | JSONL file | Writer |
+|---|---|---|
+| [`token-usage.v1.schema.json`](token-usage.v1.schema.json) | `token-usage.jsonl` | `containers/api-proxy/token-tracker.js` |
+| [`audit.v1.schema.json`](audit.v1.schema.json) | `audit.jsonl` | Squid proxy (`src/squid-config.ts`) |
+
+## Schema versioning policy
+
+- **Additive changes** (new optional fields) → update the existing `v1` schema, no version bump required.
+- **Breaking changes** (field removal, rename, type change, new required field) → create a new `v2` schema file and bump the `_schema` value in the writer.
+
+## Record identification
+
+Every JSONL record includes a `_schema` field that identifies the schema name and version:
+
+```json
+{ "_schema": "token-usage/v1", "timestamp": "2025-01-01T00:00:00.000Z", ... }
+{ "_schema": "audit/v1", "ts": 1761074374.646, ... }
+```
+
+Consumers should check `_schema` before parsing fields so they can handle future versions gracefully.
+
+## Validation
+
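+You can validate a JSONL file against its schema using any JSON Schema validator. Each schema describes a single record, so validate the file one line at a time rather than as a whole. Example using [`ajv-cli`](https://github.com/ajv-validator/ajv-cli):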
+
+```bash
+# Install validator
+npm install -g ajv-cli
+
+# Validate all records in audit.jsonl
+while IFS= read -r line; do
+  echo "$line" | ajv validate -s schemas/audit.v1.schema.json -d /dev/stdin
+done < /path/to/audit.jsonl
+```