{"ok":true,"service":"discovery-deduplication-readiness","mode":"deduplication-readiness-layer","timestamp":"2026-06-26T14:59:49.047Z","dependencies":{"discoveryFoundation":{"foundationReady":true,"liveSearchEnabled":false,"capsuleCompressionEnabled":false},"providerRegistry":{"registryReady":true,"liveProviderRetrievalEnabled":false,"providerCredentialsAttached":false},"sourceScoring":{"scoringReadinessReady":true,"liveScoringEnabled":false,"automatedSourceRejectionEnabled":false}},"summary":{"totalDeduplicationTypes":8,"typeReady":2,"typeFoundationReady":3,"typeReviewRequired":3,"totalRules":9,"ready":6,"reviewRequired":3,"blocked":0,"deduplicationReadinessReady":true,"liveDeduplicationEnabled":false,"automaticSourceDeletionEnabled":false,"vaultIngestionAllowed":false,"capsuleCompressionAllowed":false,"productionMutation":false,"reason":"Deduplication readiness is active as a non-destructive policy layer. TheoB can define duplicate clustering logic, but it cannot process live provider results, delete sources, write Vault records, or compress capsules yet."},"deduplicationTypes":[{"type":"exact-url-duplicate","status":"ready","purpose":"Detect identical canonical URLs returned by multiple providers or repeated searches.","safeguard":"A repeated URL is one source, not multiple confirmations."},{"type":"canonical-url-variant","status":"ready","purpose":"Cluster URLs that differ only by tracking parameters, mobile versions, trailing slashes, fragments, or protocol variants.","safeguard":"Strip noise carefully without destroying meaningful URL identity."},{"type":"syndicated-content-duplicate","status":"foundation-ready","purpose":"Detect when the same article or release appears across multiple publishers, mirrors, or syndication networks.","safeguard":"Syndication should not be mistaken for independent corroboration."},{"type":"near-text-duplicate","status":"foundation-ready","purpose":"Detect pages that are not identical but substantially repeat the same language, summary, or claim structure.","safeguard":"Near duplicates should lower independence, not erase all records."},{"type":"claim-level-duplicate","status":"foundation-ready","purpose":"Cluster sources repeating the same claim even when the article, page, or file is different.","safeguard":"Claims need independent-source context before confidence increases."},{"type":"dataset-version-duplicate","status":"review-required","purpose":"Detect when datasets represent different versions, revisions, snapshots, or mirrors of the same underlying data.","safeguard":"Dataset revisions must preserve version, timestamp, schema, and source provenance."},{"type":"image-visual-duplicate","status":"review-required","purpose":"Detect reused images, resized images, cropped images, screenshots, and visual near-duplicates.","safeguard":"Visual similarity must preserve uncertainty and source trail."},{"type":"diagram-schematic-duplicate","status":"review-required","purpose":"Detect reused diagrams, CAD exports, schematics, floor plans, maps, and engineering visual variants.","safeguard":"Schematic differences may be meaningful and cannot be flattened blindly."}],"readinessRules":[{"rule":"Deduplicate Before Trust","status":"ready","purpose":"TheoB must cluster repeated sources before scoring confidence.","safeguard":"Repetition is not verification."},{"rule":"Preserve Original Source Trail","status":"ready","purpose":"Every duplicate cluster must preserve all member sources and original provider trails.","safeguard":"Deduplication must compress noise, not erase provenance."},{"rule":"Canonicalization Must Be Explainable","status":"ready","purpose":"URL normalization and duplicate clustering must show why records were grouped.","safeguard":"No invisible source merging."},{"rule":"Independent Corroboration Must Be Separated","status":"ready","purpose":"TheoB must distinguish copied repetition from independent support.","safeguard":"A copied article is not the same as a second witness."},{"rule":"No Live Deduplication Yet","status":"ready","purpose":"This layer defines readiness only and does not process live provider results.","safeguard":"No provider queries, Vault writes, or automatic source changes."},{"rule":"No Automatic Source Deletion","status":"ready","purpose":"Duplicates may be clustered, but records should not be deleted automatically.","safeguard":"Keep auditability and reversibility."},{"rule":"Claim-Level Deduplication Needs Scoring Context","status":"review-required","purpose":"Duplicate claims should eventually connect to source scoring and conflict detection.","safeguard":"A repeated claim can still be wrong."},{"rule":"Multimodal Deduplication Requires Separate Methods","status":"review-required","purpose":"Images, diagrams, maps, schematics, CAD, and datasets require specialized duplicate logic.","safeguard":"Do not use text-only duplicate logic for visual or structured files."},{"rule":"Capsule Compression Depends On Deduplication","status":"review-required","purpose":"Intelligence Capsules should be built from deduplicated source clusters, not raw repeated noise.","safeguard":"A capsule must preserve enough truth to be reawakened faithfully."}],"futureDuplicateClusterShape":{"duplicateClusterId":"stable duplicate cluster id","clusterType":"exact-url/canonical-url/syndicated-content/near-text/claim-level/dataset-version/image-visual/diagram-schematic","canonicalReferenceId":"preferred reference card id","memberReferenceIds":"array of all linked reference ids","providerIds":"array of discovery provider ids","sourceUrls":"array of source URLs or safe source IDs","duplicateReason":"human-readable explanation of why records were clustered","independenceScore":"0-100","repetitionCount":"number","independentSourceCount":"number","firstSeenAt":"ISO timestamp","lastSeenAt":"ISO timestamp","conflictStatus":"none/partial/strong/unknown","sourceTrailPreserved":"true","productionMutation":"false"},"futureCanonicalizationShape":{"canonicalizationId":"stable canonicalization id","originalUrl":"original source URL","canonicalUrl":"normalized canonical URL","removedParameters":"tracking or non-semantic parameters removed","normalizationRules":"array of applied normalization rules","confidence":"low/medium/high","reviewRequired":"true/false","explanation":"human-readable canonicalization explanation"},"allowedNow":["Render deduplication readiness.","Define duplicate types.","Define duplicate cluster shape.","Define canonicalization shape.","Separate repetition from independent corroboration.","Keep live deduplication disabled.","Keep automatic source deletion disabled."],"notAllowedYet":["Deduplicate live provider results.","Query discovery providers.","Delete duplicate sources automatically.","Write duplicate clusters to the Vault.","Compress duplicate clusters into capsules.","Treat repetition as verification.","Apply text-only deduplication to images, CAD, schematics, maps, or datasets."],"nextStructuralLayers":["Discovery Conflict Detection Readiness","Discovery Vault Ingestion Readiness","TheoB Intelligence Capsule Engine Foundation","Image Duplicate Detection Readiness","Diagram And Schematic Deduplication Readiness","Visual Semantics Color Intelligence Registry"],"safeguard":"Discovery Deduplication Readiness Layer is read-only and non-destructive. It does not query providers, process live results, delete sources, ingest Vault records, compress capsules, expose secrets, mutate production, or execute agent actions."}