From fa6f22987e1e3c2eeefd14ebe30ddf5f84f2cfff Mon Sep 17 00:00:00 2001 From: tiennm99 Date: Mon, 18 May 2026 15:11:18 +0700 Subject: [PATCH] revert(deploy): roll back EventBridge HTTPS schedule attempts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts 585d996 + c70b9d0. Both CFN deploys failed at changeset validation; prod stack was never mutated. Approach A (AWS::Scheduler::Schedule with arn:aws:scheduler:::http-invoke target) is unimplementable in pure CloudFormation — the Target schema has no property for HTTP endpoint/method/headers, regardless of name. Replacement landing in a follow-up commit. Restored: plans/reports/brainstorm-260517-1411-eventbridge-schedule-fix.md (useful design context even though Approach A invalidated). --- .github/workflows/deploy.yml | 22 +- .../phase-05-eventbridge-schedules.md | 215 +++++++----------- plans/260510-0234-pre-deploy-wrapup/plan.md | 2 +- template.yaml | 41 +--- 4 files changed, 90 insertions(+), 190 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 4f05232..1b7ef7b 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -43,23 +43,17 @@ jobs: - name: SAM deploy env: ALERT_EMAIL: ${{ secrets.ALERT_EMAIL }} - STACK_ENV: prod run: | - set -euo pipefail - # EventBridge Scheduler can't resolve {{resolve:ssm-secure}} in - # HttpInvokeArgs.HeaderParameters; fetch and pass as a NoEcho param. - CRON_SECRET=$(aws ssm get-parameter \ - --name "/miti99bot/${STACK_ENV}/cron-shared-secret" \ - --with-decryption --query Parameter.Value --output text) - echo "::add-mask::$CRON_SECRET" - OVERRIDES="CronSharedSecret=$CRON_SECRET" if [ -n "$ALERT_EMAIL" ]; then - OVERRIDES="$OVERRIDES AlertEmail=$ALERT_EMAIL" + sam deploy --template-file template.yaml \ + --no-confirm-changeset \ + --no-fail-on-empty-changeset \ + --parameter-overrides "AlertEmail=$ALERT_EMAIL" + else + sam deploy --template-file template.yaml \ + --no-confirm-changeset \ + --no-fail-on-empty-changeset fi - sam deploy --template-file template.yaml \ - --no-confirm-changeset \ - --no-fail-on-empty-changeset \ - --parameter-overrides "$OVERRIDES" - name: Smoke test (Function URL responds) run: | diff --git a/plans/260510-0234-pre-deploy-wrapup/phase-05-eventbridge-schedules.md b/plans/260510-0234-pre-deploy-wrapup/phase-05-eventbridge-schedules.md index 89bee4a..e32768b 100644 --- a/plans/260510-0234-pre-deploy-wrapup/phase-05-eventbridge-schedules.md +++ b/plans/260510-0234-pre-deploy-wrapup/phase-05-eventbridge-schedules.md @@ -1,162 +1,105 @@ --- phase: 5 -title: "Wire EventBridge schedule for lolschedule_daily_push" -status: pending -priority: P1 -effort: "1h" -dependencies: [3] +title: "Wire EventBridge schedules to live cron handlers" +status: deferred +priority: P3 +effort: "30m" +dependencies: [3, 4] --- -> **Status update 2026-05-18:** Reactivated from `deferred`. Brainstorm 2026-05-17 locked in all four design decisions; this phase is now executable. See `plans/reports/brainstorm-260517-1411-eventbridge-schedule-fix.md`. +> **Status update 2026-05-10:** Deferred to first-deploy decision. Two issues surfaced during Phase 03/04 implementation: +> 1. **Trading module has no cron** in upstream — only one schedule needed (lolschedule_daily_push), not two +> 2. **Lambda Web Adapter only handles HTTP-shape events** — direct Scheduler→Lambda invokes bypass LWA, requiring either an event-shape detector in `main.go` or the HTTPS-target universal-invoke pattern (`arn:aws:scheduler:::http-invoke`, added in 2024) +> +> The HTTPS-target syntax in `AWS::Scheduler::Schedule` needs validation against the deploy-region SAM transform; doing this offline without `sam validate` access risks committing infra that won't deploy. Decision deferred to deploy-time. Once user runs Phase 01 of AWS-port plan and has SAM available, add a single schedule for `lolschedule_daily_push` per the prose below — pick HTTPS or direct invoke based on what `sam validate` accepts. -# Phase 05: Wire EventBridge schedule for lolschedule_daily_push - -## Context -- Brainstorm: `plans/reports/brainstorm-260517-1411-eventbridge-schedule-fix.md` -- Diagnosis: `/cron/lolschedule_daily_push` route + auth + dispatcher all work; no `AWS::Scheduler::Schedule` resource exists in `template.yaml` (gap noted at `template.yaml:177`). -- Cron handler: `internal/modules/lolschedule/cron.go:53-82` (`dailyPushCronName = "lolschedule_daily_push"`, schedule `0 1 * * *` UTC = 08:00 ICT). -- Trading module has no cron handler — only one schedule needed. +# Phase 05: Wire EventBridge schedules to live cron handlers ## Overview -Add one `AWS::Scheduler::Schedule` resource to `template.yaml` that invokes the Lambda Function URL via EventBridge Scheduler HTTPS-target (`arn:aws:scheduler:::http-invoke`), with `X-Cron-Token` header sourced from SSM Parameter Store via `{{resolve:ssm-secure}}`. Verify the SSM param exists pre-deploy; verify auto-fire post-deploy via Console "Run now" + CloudWatch. +With Phases 03 + 04 landed, two cron routes exist (`/cron/lolschedule_daily_push`, `/cron/trading_daily_refresh`). This phase adds concrete `AWS::Scheduler::Schedule` resources to `template.yaml` so AWS Scheduler invokes them on schedule via the existing `SchedulerExecutionRole` + `CronDLQ` already provisioned by AWS-port Phase 01. ## Requirements -- **Functional:** Schedule fires daily at 01:00 UTC. Lambda receives POST with valid `X-Cron-Token`. Handler logs `cron triggered name=lolschedule_daily_push` + `lolschedule daily push complete`. Failures retried 2× with 600s max age. Permanent failures land in `CronDLQ`. -- **Non-functional:** Stays in EventBridge Scheduler free tier (~30 invocations/mo vs 14M limit). Token rotation = SSM update + redeploy (accepted trade-off). +- **Functional:** Two schedules deploy via SAM. Each fires at the correct cron expression with `X-Cron-Token` header sourced from Parameter Store. Failures land in `CronDLQ`. +- **Non-functional:** Stays inside EventBridge Scheduler free tier (14M invocations/mo; we use ~60). Token rotation = update SSM param + redeploy (acceptable trade-off). ## Architecture ``` -EventBridge Scheduler ─cron(0 1 * * ? *) UTC─► arn:aws:scheduler:::http-invoke - SchedulerExecutionRole POST cron/lolschedule_daily_push - RetryPolicy: 2 attempts, 600s Header: X-Cron-Token: {{resolve:ssm-secure:.../cron-shared-secret}} - DLQ: CronDLQ Body: "{}" - │ - ▼ - Lambda (existing route) - router → dispatcher → dailyPushHandler +EventBridge Scheduler (rule: 0 1 * * ? *) ─HTTPS POST─► /cron/lolschedule_daily_push + + Headers: X-Cron-Token: {{from SSM}} + + Retry: max 2, max-age 600s + + DLQ: CronDLQ on permanent failure + +EventBridge Scheduler (rule: 0 8 * * ? *) ─HTTPS POST─► /cron/trading_daily_refresh + (same auth + retry + DLQ shape) ``` -IAM `lambda:InvokeFunctionUrl` already granted to `scheduler.amazonaws.com` at `template.yaml:170` — no IAM changes. +**HTTPS target syntax:** EventBridge Scheduler uses `arn:aws:scheduler:::http-invoke` with `HttpParameters` carrying headers. SAM's `AWS::Scheduler::Schedule` resource passes through to this; no SAM transform magic needed. ## Related Code Files -- Modify: `template.yaml` — append single `AWS::Scheduler::Schedule` resource after `SchedulerExecutionRole` (~line 178), inside existing `# --- Cron ---` block. -- Reference (no edit): existing `SchedulerExecutionRole`, `CronDLQ`, `BotFunctionUrl` output in `template.yaml`. -- Reference (no edit): `aws/README.md` §2 — SSM cron-shared-secret setup. +- Modify: `template.yaml` — append `LolscheduleDailyPushSchedule` + `TradingDailyRefreshSchedule` resources +- Reference (no edit): existing `SchedulerExecutionRole` + `CronDLQ` in `template.yaml` +- Reference: `aws/README.md` (SSM parameter setup for `/miti99bot/prod/cron-shared-secret`) ## Implementation Steps +1. Confirm AWS SDK / CloudFormation supports `aws.HttpInvoke` target via `AWS::Scheduler::Schedule` for the deploy region (`ap-southeast-1`). Check via `aws cloudformation describe-type --type RESOURCE --type-name AWS::Scheduler::Schedule` if uncertain. +2. Append to `template.yaml`: + ```yaml + LolscheduleDailyPushSchedule: + Type: AWS::Scheduler::Schedule + Properties: + Name: !Sub "${AWS::StackName}-lolschedule-daily-push" + ScheduleExpression: "cron(0 1 * * ? *)" # 01:00 UTC = 08:00 ICT + FlexibleTimeWindow: { Mode: OFF } + State: ENABLED + Target: + Arn: !GetAtt BotFunction.Arn # Lambda direct? Or HTTPS? Decide per step 1 + RoleArn: !GetAtt SchedulerExecutionRole.Arn + RetryPolicy: { MaximumRetryAttempts: 2, MaximumEventAgeInSeconds: 600 } + DeadLetterConfig: { Arn: !GetAtt CronDLQ.Arn } + Input: '{"name":"lolschedule_daily_push"}' + # IF using HTTPS invoke (preferred for route preservation): + # Replace `Arn: !GetAtt BotFunction.Arn` with the universal target + # `Arn: arn:aws:scheduler:::http-invoke` and add HttpParameters. -### 1. Pre-deploy: verify SSM secret exists -```sh -aws ssm get-parameter --name /miti99bot/prod/cron-shared-secret \ - --with-decryption --region ap-southeast-1 \ - --query 'Parameter.Value' --output text -``` -Must return a non-empty value. If missing or empty: -```sh -openssl rand -hex 32 | xargs -I{} aws ssm put-parameter \ - --name /miti99bot/prod/cron-shared-secret \ - --value {} --type SecureString --region ap-southeast-1 -``` -Rationale: `cmd/server/main.go:124` silently disables `/cron/*` (404 all hits) when `CRON_SHARED_SECRET` is empty. `{{resolve:ssm-secure}}` in template fails `sam deploy` loudly if the parameter is missing. - -### 2. Append to `template.yaml` (after `SchedulerExecutionRole`, before `# --- Cost guard ---`) -```yaml - LolscheduleDailyPushSchedule: - Type: AWS::Scheduler::Schedule - Properties: - Name: !Sub "${AWS::StackName}-lolschedule-daily-push" - ScheduleExpression: "cron(0 1 * * ? *)" # 01:00 UTC = 08:00 ICT - ScheduleExpressionTimezone: UTC - FlexibleTimeWindow: { Mode: OFF } - State: ENABLED - Target: - Arn: arn:aws:scheduler:::http-invoke - RoleArn: !GetAtt SchedulerExecutionRole.Arn - Input: "{}" - RetryPolicy: - MaximumRetryAttempts: 2 - MaximumEventAgeInSeconds: 600 - DeadLetterConfig: - Arn: !GetAtt CronDLQ.Arn - HttpInvokeArgs: - EndpointUrl: !Sub "${BotFunctionUrl.FunctionUrl}cron/lolschedule_daily_push" - HttpMethod: POST - HeaderParameters: - X-Cron-Token: !Sub "{{resolve:ssm-secure:/miti99bot/${StackEnv}/cron-shared-secret}}" -``` - -### 3. Local validate (sam-validate gate) -```sh -make sam-validate -``` -If `HttpInvokeArgs` is rejected by current SAM transform, iterate property name: -- Candidate A: `HttpParameters` (older EventBridge Rules shape, may apply) -- Candidate B: nested under `Target.HttpInvokeParameters` -- Candidate C: top-level `Target.HttpInvocationConfig` -- Last resort: switch to `AWS::Events::Connection` + `AWS::Events::ApiDestination` + `AWS::Events::Rule` (three resources instead of one; documented well). - -Verify URL trailing-slash: `!GetAtt BotFunctionUrl.FunctionUrl` returns `https://….on.aws/`; concatenation `${...}cron/lolschedule_daily_push` yields a clean single-slash path. Confirm in `sam package` output if uncertain. - -### 4. Commit + deploy via CI -- Commit message: `feat(deploy): wire EventBridge schedule for lolschedule daily push` -- Push to `main` → `.github/workflows/deploy.yml` runs `sam deploy` via OIDC. -- Watch the workflow run for `LolscheduleDailyPushSchedule` CREATE_COMPLETE in CloudFormation output. - -### 5. Post-deploy: verify schedule fires correctly -1. AWS Console → EventBridge → Scheduler → `miti99bot-lolschedule-daily-push` → **Run now**. -2. CloudWatch Logs `/aws/lambda/miti99bot` → expect within 60s: - - `cron triggered route=/cron name=lolschedule_daily_push` - - `lolschedule daily push complete subscribers=N sent=N failed=0 pruned=0` -3. Synthetic 401 test: - ```sh - URL=$(aws cloudformation describe-stacks --stack-name miti99bot \ - --query "Stacks[0].Outputs[?OutputKey=='FunctionUrl'].OutputValue" --output text) - curl -i -X POST "${URL}cron/lolschedule_daily_push" -H "X-Cron-Token: wrong" + TradingDailyRefreshSchedule: + Type: AWS::Scheduler::Schedule + Properties: + Name: !Sub "${AWS::StackName}-trading-daily-refresh" + ScheduleExpression: "cron(0 8 * * ? *)" # 08:00 UTC = 15:00 ICT (market close) + FlexibleTimeWindow: { Mode: OFF } + State: ENABLED + Target: + # Same shape as above + RoleArn: !GetAtt SchedulerExecutionRole.Arn + RetryPolicy: { MaximumRetryAttempts: 2, MaximumEventAgeInSeconds: 600 } + DeadLetterConfig: { Arn: !GetAtt CronDLQ.Arn } + Input: '{"name":"trading_daily_refresh"}' ``` - Expect HTTP 401 + log line `cron rejected reason=secret_mismatch`. -4. DLQ sanity check: `aws sqs get-queue-attributes --queue-url --attribute-names ApproximateNumberOfMessages` — must be 0 after the successful "Run now". - -### 6. Next-day auto-fire check -At 01:01 UTC the day after deploy, re-tail CloudWatch — confirm the scheduled fire occurred. Mark phase `status: done` only after this passes. - -## Todo -- [ ] Step 1: Verify (or create) `/miti99bot/prod/cron-shared-secret` SSM param -- [ ] Step 2: Append `LolscheduleDailyPushSchedule` resource to `template.yaml` -- [ ] Step 3: `make sam-validate` passes (iterate property name if rejected) -- [ ] Step 4: Commit + push to `main`; CI deploy succeeds -- [ ] Step 5a: Console "Run now" → handler logs in CloudWatch within 60s -- [ ] Step 5b: Wrong-token curl → 401 + audit log -- [ ] Step 5c: DLQ empty -- [ ] Step 6: Next-day 01:00 UTC auto-fire observed → mark phase done +3. **Decide direct-invoke vs HTTPS** at implementation time: + - **HTTPS (preferred):** preserves `/cron/{name}` route; works with existing dispatcher; same shape as local-dev `curl` smoke. Need `HttpParameters` block with `X-Cron-Token` header. + - **Direct Lambda invoke:** simpler IAM, lower latency, bypasses HTTP layer. Requires a Lambda event-shape branch in `cmd/server/main.go` to detect Scheduler events vs Function URL events. + - Default: HTTPS for KISS; switch only if HTTPS proves flaky. +4. Validate locally: `make sam-validate` should pass. +5. After AWS-port Phase 01 deploy: + - Console → EventBridge Scheduler → "Run now" each rule. Confirm 200 from Lambda. + - Check CloudWatch log group for the cron handler executing. + - Send a synthetic invocation that fails (wrong token) — confirm DLQ receives the failed message. +6. Watch first scheduled fire from the AWS console (use a temporary `rate(2 minutes)` to verify, then revert). ## Success Criteria -- [ ] One new resource in `template.yaml`, zero Go/IAM/secret changes +- [ ] Two schedules in `template.yaml` - [ ] `sam validate` passes -- [ ] CI deploy succeeds without manual intervention -- [ ] "Run now" returns 200 and triggers handler within 60s -- [ ] Wrong/missing `X-Cron-Token` → 401, no handler invocation -- [ ] DLQ remains empty under normal operation -- [ ] Next scheduled fire (01:00 UTC) executes automatically +- [ ] Post-deploy: Manual "run now" returns 200 and triggers handler +- [ ] DLQ receives failed invocations (synthetic test) +- [ ] First scheduled fire happens at the correct UTC time ## Risk Assessment -| Risk | Likelihood | Impact | Mitigation | -|---|---|---|---| -| `HttpInvokeArgs` property name wrong for current SAM transform | Med | Deploy fails | `sam validate` gate locally before commit; iterate property name (see Step 3 candidates) | -| SSM `cron-shared-secret` missing or empty | Low | Deploy fails OR Lambda silently 404s | Pre-deploy verification step (Step 1) | -| Function URL concat double-slash | Low | 404 from Lambda | URL ends `/`, path starts `cron/` (no leading slash) — single slash by construction; confirm in `sam package` output | -| Token rotation breaks production | Low | Cron stops firing until redeploy | Phase-05 brainstorm accepted: rotation = SSM update + redeploy. Document in `aws/README.md` if not already | -| First scheduled fire misses (timezone) | Low | Cron fires 7h early or late | `ScheduleExpressionTimezone: UTC` explicit; matches code constant; documented in inline comment | -| Cold start exceeds 30s timeout during cron | Low | First fire times out, 2 retries | Retry policy covers; if persistent, increase Globals.Function.Timeout from 30s | +- **`AWS::Scheduler::Schedule` HTTPS-target syntax** still evolving — mitigated by step 1 confirmation and ability to fall back to direct invoke. +- **Token mismatch between SSM and Lambda env** — both resolve at deploy time from the same parameter; no drift unless one is rotated independently. +- **Cron firing before Lambda is deployed** during stack creation — CloudFormation orders dependencies; Schedules `DependsOn: BotFunction` if needed (probably auto from Arn ref). +- **Time-zone confusion** — cron expressions use UTC; verified in comments next to each expression. -## Security Considerations -- `X-Cron-Token` is a shared secret embedded into the schedule definition via `{{resolve:ssm-secure}}` at deploy time. The resolved value is visible in the EventBridge Scheduler console (Target → Headers) to anyone with `scheduler:GetSchedule` IAM. Accepted: same blast radius as the Lambda env var holding the same secret. -- Function URL is `AuthType: NONE` — anyone can hit `/cron/lolschedule_daily_push` with the right token. Constant-time compare at `internal/server/router.go:76` prevents timing-attack leakage. -- DLQ contents may contain the request body (`"{}"`, no sensitive data) but DO contain the failed-invocation metadata. SQS queue is private to the AWS account. - -## Next Steps -- After Step 6 passes, mark this phase `status: done` and update parent `plan.md` Phase 5 row. -- This closes the `260510-0234-pre-deploy-wrapup` plan (Phases 01-04 already done). -- Unblocks `260510-0114-aws-port` Phase 04 verification (which becomes a no-op now that this phase delivers the same outcome via the same shape). - -## Open Questions -None — brainstorm closed all four discovery items. The only remaining open item is the `HttpInvokeArgs` property-name validation, absorbed into Step 3 as a validate-and-iterate gate. +## Open questions +1. Direct invoke vs HTTPS — final decision lives here, not Phase 04 of AWS-port plan. +2. Add a third schedule for a manual "ad-hoc" endpoint (e.g. for testing without console)? YAGNI — `aws scheduler invoke-now` works. +3. Schedule `State: ENABLED` vs `DISABLED` initially? ENABLED — first deploy implicitly trusts the cron handlers; if either causes prod issues, disable via console immediately. diff --git a/plans/260510-0234-pre-deploy-wrapup/plan.md b/plans/260510-0234-pre-deploy-wrapup/plan.md index b0f14d0..3ed3106 100644 --- a/plans/260510-0234-pre-deploy-wrapup/plan.md +++ b/plans/260510-0234-pre-deploy-wrapup/plan.md @@ -32,7 +32,7 @@ From the punch-list: | 02 | [Cold-start metric filter](phase-02-metric-filter.md) | done | 30m | `AWS::Logs::MetricFilter` for `Init Duration` in `template.yaml` | | 03 | [lolschedule daily-push cron](phase-03-lolschedule-cron.md) | done | 3h | `Crons()` registered; Deps exposes bot for fan-out; daily push at 08:00 ICT | | 04 | [Trading module port](phase-04-trading-module.md) | done (scope-trimmed: no daily refresh cron, no leaderboard — neither in upstream) | 4h | VN-stocks paper trading: topup/buy/sell/stats/convert; KBS price source | -| 05 | [Wire EventBridge schedules](phase-05-eventbridge-schedules.md) | pending | 1h | `AWS::Scheduler::Schedule` HTTPS-invoke for `lolschedule_daily_push` — design locked by brainstorm 2026-05-17 | +| 05 | [Wire EventBridge schedules](phase-05-eventbridge-schedules.md) | **deferred to first-deploy decision** | 30m | `AWS::Scheduler::Schedule` resource for lolschedule cron — needs HTTPS-vs-direct-invoke call validated against live SAM CLI | ## Dependency graph ``` diff --git a/template.yaml b/template.yaml index 9734447..ece0e88 100644 --- a/template.yaml +++ b/template.yaml @@ -40,19 +40,6 @@ Parameters: Default: "" Description: Email for $1 budget alert. Leave empty to skip the budget resource. - # CFN does not allow {{resolve:ssm-secure:...}} inside - # AWS::Scheduler::Schedule HttpInvokeArgs.HeaderParameters (documented - # CloudFormation limitation — secure-string dynamic refs have a fixed - # property allowlist that excludes Scheduler header parameters). CI fetches - # /miti99bot/${StackEnv}/cron-shared-secret from SSM at deploy time and - # passes it here via --parameter-overrides. NoEcho masks the value in the - # CloudFormation console. - CronSharedSecret: - Type: String - NoEcho: true - Default: "" - Description: Shared secret EventBridge Scheduler attaches as the X-Cron-Token header. Sourced from SSM by CI. - Conditions: HasAlertEmail: !Not [!Equals [!Ref AlertEmail, ""]] @@ -187,32 +174,8 @@ Resources: Action: sqs:SendMessage Resource: !GetAtt CronDLQ.Arn - LolscheduleDailyPushSchedule: - Type: AWS::Scheduler::Schedule - Properties: - Name: !Sub "${AWS::StackName}-lolschedule-daily-push" - ScheduleExpression: "cron(0 1 * * ? *)" # 01:00 UTC = 08:00 ICT - ScheduleExpressionTimezone: UTC - FlexibleTimeWindow: { Mode: OFF } - State: ENABLED - Target: - # HTTPS universal target — preserves the /cron/{name} route inside the - # Lambda. IAM (lambda:InvokeFunctionUrl) is already granted to - # scheduler.amazonaws.com on SchedulerExecutionRole above. - Arn: arn:aws:scheduler:::http-invoke - RoleArn: !GetAtt SchedulerExecutionRole.Arn - Input: "{}" - RetryPolicy: - MaximumRetryAttempts: 2 - MaximumEventAgeInSeconds: 600 - DeadLetterConfig: - Arn: !GetAtt CronDLQ.Arn - HttpInvokeArgs: - # FunctionUrl ends with '/'; path starts without one → clean single-slash join. - EndpointUrl: !Sub "${BotFunctionUrl.FunctionUrl}cron/lolschedule_daily_push" - HttpMethod: POST - HeaderParameters: - X-Cron-Token: !Ref CronSharedSecret + # Concrete AWS::Scheduler::Schedule resources are added per cron handler; + # the role + DLQ above are provisioned once and reused across all schedules. # --- Cost guard -----------------------------------------------------------