A self-improving knowledge graph system that maintains MECE (Mutually Exclusive, Collectively Exhaustive) compliance through automated refactoring, semantic search, and event-driven monitoring.
/**
 * A node in the knowledge graph.
 *
 * `embedding` is optional; the similarity-based code in this document
 * (semantic search, duplicate merging) only considers entities that have one.
 */
interface Entity {
id: string; // Format: "type:canonical_name"
type: string; // Entity type (e.g., "person", "organization", "concept")
canonical: string; // Canonical (normalized) form of the name
aliases: string[]; // Alternative names for the same entity
embedding?: number[]; // 768-dim vector for semantic search
metadata?: Record<string, any>; // Free-form extra attributes (e.g. a description)
}
/**
 * A relationship between two entities: subject --predicate--> object.
 * `subject` and `object` hold entity IDs (strings), not entity objects.
 */
interface Triple {
subject: string; // Entity ID
predicate: string; // Relationship type (e.g., "has_skill")
object: string; // Entity ID
confidence?: number; // Optional confidence score (0-1)
}
/**
 * Persisted state of the knowledge graph: entities, triples, lookup indices
 * and bookkeeping fields.
 */
interface Graph {
  entities: Record<string, Entity>;
  triples: Triple[];
  entityIndex: {
    byType: Record<string, string[]>; // presumably entity type → entity IDs (TODO confirm)
    byCanonical: Record<string, string>; // presumably canonical name → entity ID (TODO confirm)
    byAlias: Record<string, string>; // normalized alias → entity ID
  };
  synonymMap: Record<string, string>; // Runtime-learned synonyms: normalized variant → normalized canonical
  processedItems: string[]; // Tracking processed sources
  lastUpdated: string;
}

/**
 * Looks for an existing entity matching `name`.
 *
 * Stage 1 resolves synonyms, normalizes the name and checks the alias index.
 * Stage 2 (only when `useSemanticSearch` is truthy) embeds the raw name and
 * returns the most similar entity whose cosine similarity reaches the
 * threshold.
 *
 * @param params.type                Optional entity type; restricts the semantic stage only.
 * @param params.name                Name to look up.
 * @param params.useSemanticSearch   Enables the embedding-based fallback.
 * @param params.similarityThreshold Minimum cosine similarity for a semantic hit (default 0.85).
 * @returns `{ found: true, entityId }` on a hit (plus `suggestion` for semantic
 *          hits), otherwise `{ found: false }`.
 */
async function searchEntity(params: {
  type?: string;
  name: string;
  useSemanticSearch?: boolean;
  similarityThreshold?: number;
}): Promise<SearchResult> {
  // The original body used bare `name` / `type` / `useSemanticSearch`,
  // which were never bound from `params`.
  const { type, name, useSemanticSearch, similarityThreshold = 0.85 } = params;

  // Stage 1: Exact alias match
  const normalized = normalize(applySynonym(name));
  const aliasHit = graph.entityIndex.byAlias[normalized];
  if (aliasHit) {
    return { found: true, entityId: aliasHit };
  }

  // Stage 2: Semantic similarity search
  if (!useSemanticSearch) {
    return { found: false };
  }

  const queryEmbedding = await generateEmbedding(name);

  // Single pass keeping the best candidate; on ties the earliest entity wins,
  // which matches what a stable descending sort would have picked.
  let best: { entity: Entity; similarity: number } | undefined;
  for (const entity of Object.values(graph.entities)) {
    if (type && entity.type !== type) continue;
    if (!entity.embedding) continue;

    const similarity = cosineSimilarity(queryEmbedding, entity.embedding);
    if (similarity < similarityThreshold) continue;
    if (!best || similarity > best.similarity) {
      best = { entity, similarity };
    }
  }

  if (!best) {
    return { found: false };
  }
  return {
    found: true,
    entityId: best.entity.id,
    suggestion: `Found semantically similar entity (${(best.similarity * 100).toFixed(1)}% match)`
  };
}
Key Features:
- 768-dimensional embeddings (e.g., text-embedding-004)
- Cosine similarity threshold: 0.85 for search, 0.90 for merge
- Finds variants: "organization" ≈ "organisation" ≈ "org"
/**
 * Records a runtime-learned synonym so later lookups of `variant` resolve to
 * `canonical`. Both sides are normalized before being stored, and the graph
 * is saved afterwards.
 *
 * @param params.variant   Alternative spelling (e.g. "ceo").
 * @param params.canonical Name the variant should resolve to.
 * @returns `{ success: true }` once the mapping has been stored.
 */
function addSynonym(params: {
  variant: string;
  canonical: string;
}): Result {
  // The original body referenced bare `variant` / `canonical`, which are not
  // in scope (only `params` is), so every call threw a ReferenceError.
  const { variant, canonical } = params;
  graph.synonymMap[normalize(variant)] = normalize(canonical);
  saveGraph();
  return { success: true };
}
/**
 * Resolves `name` through the runtime synonym map.
 *
 * @param name Raw name as extracted.
 * @returns The stored canonical form when the normalized name has a synonym
 *          entry; otherwise `name` unchanged (not normalized).
 */
function applySynonym(name: string): string {
  const normalized = normalize(name);
  // Own-property check: a bare `synonymMap[normalized]` would return inherited
  // members such as `constructor` or `toString` for names that normalize to
  // those keys, handing a function back to the caller instead of a string.
  const hasMapping = Object.prototype.hasOwnProperty.call(graph.synonymMap, normalized);
  if (!hasMapping) {
    return name;
  }
  return graph.synonymMap[normalized] || name;
}
Runtime Learning:
- No hardcoded mappings
- Agent learns synonyms during extraction
- Example: "ceo" → "chief_executive_officer"
// Entity ID format: "type:canonical_name"
// Declarative summary of the naming conventions for entity IDs. No code shown
// in this document reads these flags; they record the convention that the
// good/bad examples below illustrate.
const NAMING_RULES = {
format: "snake_case",
atomic: true, // One concept per entity
normalized: true, // Lowercased, trimmed
descriptive: true // Clear, unambiguous names
};
// Good examples:
// person:john_doe
// organization:acme_corp
// skill:machine_learning
// location:new_york
// Bad examples:
// JohnDoe (no type prefix)
// person:john-doe-engineer (not atomic, hyphen not snake_case)
// org:ACME (not normalized)

/**
 * Finds entities whose embeddings are near-identical and merges each
 * duplicate into the earliest matching entity.
 *
 * Only entities that carry an embedding are considered. Once an entity has
 * been merged away it is skipped for the rest of the scan, so it is never
 * merged a second time and never becomes the target of a later merge.
 *
 * @param params.entityType          Restrict the scan to one entity type.
 * @param params.similarityThreshold Minimum cosine similarity to merge (default 0.9).
 * @returns The list of `{ from, to }` merges performed and their count.
 */
async function mergeDuplicates(params: {
  entityType?: string;
  similarityThreshold?: number;
}): Promise<MergeResult> {
  // `??` rather than `||` so that an explicit threshold of 0 is honoured.
  const threshold = params.similarityThreshold ?? 0.9;
  const candidates = Object.values(graph.entities)
    .filter(e => !params.entityType || e.type === params.entityType)
    .filter(e => e.embedding);

  const absorbed = new Set<string>(); // IDs already merged into another entity
  const merged: Array<{ from: string; to: string }> = [];

  for (let i = 0; i < candidates.length; i++) {
    const keeper = candidates[i];
    if (absorbed.has(keeper.id) || !keeper.embedding) continue;

    for (let j = i + 1; j < candidates.length; j++) {
      const duplicate = candidates[j];
      if (absorbed.has(duplicate.id) || !duplicate.embedding) continue;

      const sim = cosineSimilarity(keeper.embedding, duplicate.embedding);
      if (sim >= threshold) {
        // Merge the later entity into the earlier one, update all triples
        mergeEntities(duplicate.id, keeper.id);
        absorbed.add(duplicate.id);
        merged.push({ from: duplicate.id, to: keeper.id });
      }
    }
  }

  return { success: true, mergedCount: merged.length, merged };
}

/**
 * Extracts knowledge from each item in order and, after every 10th item,
 * runs a refactoring pass: consolidates predicate groups that contain three
 * or more similar predicates, reports entity types with a single entity, and
 * prints graph statistics.
 *
 * @param items Items to process sequentially.
 */
async function extractWithAutoRefactor(items: Item[]) {
  for (const [index, item] of items.entries()) {
    const itemNumber = index + 1;

    // Extract knowledge from item
    await extractItem(item);

    // AUTO-REFACTOR: Every 10 items
    if (itemNumber % 10 !== 0) continue;
    console.log(`🔄 AUTO-REFACTORING (after ${itemNumber} items)`);

    // 1. Analyze predicates
    const predicateAnalysis = analyzePredicates();
    for (const dup of predicateAnalysis.duplicates) {
      if (dup.predicates.length < 3) continue; // Threshold: 3+ similar
      console.log(`Consolidating: ${dup.predicates.join(', ')} → ${dup.suggestedMerge}`);
      const result = consolidatePredicates({
        predicates: dup.predicates,
        targetPredicate: dup.suggestedMerge
      });
      console.log(`✅ Consolidated ${result.updatedCount} triples`);
    }

    // 2. Analyze entity types
    const typeAnalysis = analyzeEntityTypes();
    const underused = typeAnalysis.types.filter(t => t.count === 1);
    if (underused.length > 0) {
      console.log(`ℹ️ Found ${underused.length} underused types - consider merging`);
    }

    // 3. Show stats
    const stats = getStats();
    console.log(`📈 After refactoring: ${stats.predicates} predicates, ${stats.entityTypes} types`);
  }
}
// 1. Consolidate redundant predicates
/**
 * Rewrites every triple whose predicate is in `predicates` so that it uses
 * `targetPredicate`, then saves the graph.
 *
 * `targetPredicate` is normally one of the listed predicates; triples that
 * already use it are left alone and are NOT counted, so `updatedCount`
 * reflects triples that actually changed.
 *
 * @param params.predicates      Predicates to fold together.
 * @param params.targetPredicate Predicate that replaces them.
 * @returns `{ success: true, updatedCount }`.
 */
function consolidatePredicates(params: {
  predicates: string[];
  targetPredicate: string;
}): Result {
  const { targetPredicate } = params;
  const toReplace = new Set(params.predicates);
  toReplace.delete(targetPredicate); // already canonical, nothing to rewrite

  let updatedCount = 0;
  for (const triple of graph.triples) {
    if (toReplace.has(triple.predicate)) {
      triple.predicate = targetPredicate;
      updatedCount++;
    }
  }

  saveGraph();
  return { success: true, updatedCount };
}
// Example:
// Before: has_skill, possesses_skill, knows_skill (3 predicates, 45 triples)
// After: has_skill (1 predicate, 45 triples)
// 2. Rename entity type
/**
 * Moves every entity of `oldType` to `newType`, recomputing its ID as
 * `"<newType>:<canonical>"` and redirecting all triples to the new ID.
 *
 * Compared with a plain in-place rename this also:
 *  - re-keys `graph.entities` when the map is keyed by entity ID, so the key
 *    and `entity.id` cannot drift apart;
 *  - handles an ID collision (an entity with the new ID already exists) by
 *    folding the renamed entity's aliases into the existing one and dropping
 *    the renamed entity. Its embedding and metadata are not merged —
 *    NOTE(review): confirm that is acceptable;
 *  - rewrites triples in one pass instead of once per renamed entity.
 *
 * @param params.oldType Type to rename from.
 * @param params.newType Type to rename to.
 * @returns `{ success: true }` after the index is rebuilt and the graph saved.
 */
function renameEntityType(params: {
  oldType: string;
  newType: string;
}): Result {
  const { oldType, newType } = params;
  const renamedIds = new Map<string, string>();

  // Object.entries takes a snapshot, so re-keying inside the loop is safe.
  for (const [key, entity] of Object.entries(graph.entities)) {
    if (entity.type !== oldType) continue;

    const oldId = entity.id;
    const newId = `${newType}:${entity.canonical}`;
    entity.type = newType;
    if (newId === oldId) continue;
    renamedIds.set(oldId, newId);

    if (key !== oldId) {
      // Map is not keyed by ID for this entry; leave the key untouched.
      entity.id = newId;
      continue;
    }

    const existing = graph.entities[newId];
    delete graph.entities[oldId];
    if (existing) {
      // Collision: keep the existing entity and absorb the aliases.
      for (const alias of entity.aliases) {
        if (!existing.aliases.includes(alias)) existing.aliases.push(alias);
      }
      continue;
    }

    entity.id = newId;
    graph.entities[newId] = entity;
  }

  // Update triples
  if (renamedIds.size > 0) {
    for (const triple of graph.triples) {
      triple.subject = renamedIds.get(triple.subject) ?? triple.subject;
      triple.object = renamedIds.get(triple.object) ?? triple.object;
    }
  }

  rebuildIndex();
  saveGraph();
  return { success: true };
}
// 3. Rename predicate
/**
 * Replaces one predicate name with another on every triple that uses it,
 * then saves the graph.
 *
 * @param params.oldPredicate Predicate to replace.
 * @param params.newPredicate Replacement predicate.
 * @returns `{ success: true }`.
 */
function renamePredicate(params: {
  oldPredicate: string;
  newPredicate: string;
}): Result {
  const { oldPredicate, newPredicate } = params;
  for (const triple of graph.triples) {
    if (triple.predicate !== oldPredicate) continue;
    triple.predicate = newPredicate;
  }
  saveGraph();
  return { success: true };
}

/** One line of the NDJSON event log. */
interface LogEntry {
  time: number; // Unix timestamp (ms)
  timestamp: string; // ISO 8601
  level: 'info' | 'debug' | 'error';
  itemId: string; // Current item being processed
  toolName: string; // Tool/function called
  input: any; // Input parameters
  output?: any; // Output result
  duration?: number; // Execution time (ms)
  error?: string; // Error message if failed
  pid: number; // Process ID
}
/**
 * Appends one event to the NDJSON log file at `logPath`.
 *
 * `time`, `timestamp` and `pid` are filled in here; everything else comes
 * from `event`. The write is synchronous.
 *
 * @param event Log fields other than `time`, `timestamp` and `pid`.
 */
function logEvent(event: Omit<LogEntry, 'time' | 'timestamp' | 'pid'>) {
  // Read the clock once so `time` and `timestamp` always describe the same
  // instant; two separate reads can straddle a millisecond boundary.
  const now = new Date();
  const logEntry: LogEntry = {
    time: now.getTime(),
    timestamp: now.toISOString(),
    pid: process.pid,
    ...event
  };
  // Append to NDJSON file (newline-delimited JSON)
  appendFileSync(logPath, JSON.stringify(logEntry) + '\n');
}
Example Log Entries:
{"time":1735407123456,"timestamp":"2025-12-28T10:12:03.456Z","level":"info","itemId":"item_001","toolName":"search_entity","input":{"name":"machine_learning","type":"skill"},"pid":12345}
{"time":1735407123733,"timestamp":"2025-12-28T10:12:03.733Z","level":"debug","itemId":"item_001","toolName":"search_entity","input":{"name":"machine_learning","type":"skill"},"output":{"found":true,"entityId":"skill:machine_learning"},"duration":277,"pid":12345}
{"time":1735407890000,"timestamp":"2025-12-28T10:24:50.000Z","level":"info","itemId":"auto-refactor","toolName":"auto_refactor","input":{"action":"consolidate_predicates","predicates":["has_skill","knows_skill"],"targetPredicate":"has_skill"},"pid":12345}Logging Wrapper:
/**
 * Wraps an async tool handler so every call is logged: an 'info' entry when
 * the call starts, then either a 'debug' entry with the result and duration
 * or an 'error' entry with the failure message and duration. The handler's
 * result is returned unchanged and its error is re-thrown unchanged.
 *
 * @param toolName  Name recorded in each log entry.
 * @param handler   The tool implementation to wrap.
 * @param getItemId Supplies the ID of the item currently being processed.
 *                  `LogEntry.itemId` is required, but the wrapper previously
 *                  never set it; callers that do not pass a resolver get
 *                  "unknown".
 * @returns A function with the same call shape as `handler`.
 */
function wrapWithLogging<T>(
  toolName: string,
  handler: (args: any) => Promise<T>,
  getItemId: () => string = () => 'unknown'
) {
  return async (args: any): Promise<T> => {
    const startTime = Date.now();
    const itemId = getItemId();
    logEvent({ level: 'info', itemId, toolName, input: args });
    try {
      const result = await handler(args);
      logEvent({
        level: 'debug',
        itemId,
        toolName,
        input: args,
        output: result,
        duration: Date.now() - startTime
      });
      return result;
    } catch (error: unknown) {
      // A handler may throw a non-Error value (string, null, ...); reading
      // `.message` on it would log `undefined` or throw inside the catch.
      const message = error instanceof Error ? error.message : String(error);
      logEvent({
        level: 'error',
        itemId,
        toolName,
        input: args,
        error: message,
        duration: Date.now() - startTime
      });
      throw error;
    }
  };
}
// READ TOOLS (6)
// Tool names grouped by purpose. The 6 + 3 + 2 + 2 names below are the same
// thirteen listed under "MCP Tools (13)" later in this document.
const readTools = [
'search_entity', // Find entity by name (semantic search)
'list_entity_types', // List all entity types
'list_entities_by_type', // List entities of specific type
'list_predicates', // List all predicates with examples
'find_triples', // Query triples by pattern
'get_stats' // Graph statistics
];
// WRITE TOOLS (3)
const writeTools = [
'add_entity', // Create new entity (with dedup check)
'add_triple', // Create relationship
'mark_item_processed' // Track processed items
];
// MECE TOOLS (2)
const meceTools = [
'add_synonym', // Map variant → canonical
'merge_duplicates' // Find and merge similar entities
];
// QUALITY TOOLS (2)
const qualityTools = [
'check_isolated_entities', // Find orphaned entities
'cleanup_isolated_entities' // Remove orphans
];// Agent workflow for each item:
/**
 * Illustrative agent workflow for a single item: search before creating,
 * create the entity only when nothing was found, link it with a triple, clean
 * up isolated entities, then mark the item as processed. The entity and
 * relationship values are hard-coded example data.
 *
 * @param item The item being processed; only `item.id` is read here.
 */
async function processItem(item: Item) {
  // Step 1: Search before creating
  const searchResult = await search_entity({
    name: "machine_learning",
    type: "skill",
    useSemanticSearch: true
  });

  let entityId: string;
  if (searchResult.found) {
    entityId = searchResult.entityId; // Reuse existing
  } else {
    // Step 2: Create new entity
    const createResult = await add_entity({
      type: "skill",
      name: "machine_learning",
      metadata: { description: "AI subdomain" }
    });
    entityId = createResult.entityId;
  }

  // Step 3: Create relationships
  await add_triple({
    subject: "person:john_doe",
    predicate: "has_skill",
    object: entityId
  });

  // Step 4: Quality check
  const isolated = await check_isolated_entities();
  if (isolated.count > 0) {
    await cleanup_isolated_entities();
  }

  // Step 5: Mark complete
  await mark_item_processed({ itemId: item.id });
}

/**
 * Ratio of entities to triples, as a percentage. Lower means the same
 * entities are reused across more relationships.
 *
 * @param graph Graph to measure.
 * @returns `(entities per item / triples per item) * 100`, or 0 when no items
 *          have been processed or the graph has no triples (nothing to
 *          measure, and the ratio would otherwise divide by zero).
 */
function calculateEntityCompression(graph: Graph): number {
  // `entities` is a Record, not an array: `.length` on it is undefined and
  // turned the whole result into NaN.
  const totalEntities = Object.keys(graph.entities).length;
  const totalTriples = graph.triples.length;
  const processedItems = graph.processedItems.length;
  if (processedItems === 0 || totalTriples === 0) return 0;

  const avgEntitiesPerItem = totalEntities / processedItems;
  const avgTriplesPerItem = totalTriples / processedItems;
  // Lower ratio = better reuse (more compression)
  return (avgEntitiesPerItem / avgTriplesPerItem) * 100;
}
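// Target: a LOW ratio — lower means more entity reuse. (The ">50% compression" figure conflicts with the examples below, where 25% fails and 15% passes; confirm the intended cutoff.)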
// Example: 50 entities, 200 triples, 10 items
// avgEntities = 5, avgTriples = 20
// compression = (5/20) * 100 = 25% ❌ (too many unique entities)
//
// Example: 30 entities, 200 triples, 10 items
// avgEntities = 3, avgTriples = 20
// compression = (3/20) * 100 = 15% ✅ (good reuse)

/**
 * Composite MECE score in the range 0-100 (higher is better).
 *
 * Four components, each scaled to 0-100 and clamped before weighting:
 *  - entity reuse            (40%): 100 - compression ratio
 *  - type consolidation      (25%): 100 - 5 per entity type
 *  - predicate consolidation (25%): 100 - 5 per predicate
 *  - connectivity            (10%): average degree, where 10 or more scores 100
 *
 * Clamping each component stops one runaway value (e.g. more than 20
 * predicates) from going negative and cancelling the other components. The
 * connectivity term was previously on a 0-1 scale, so it could add at most
 * about 0.1 points instead of its intended 10.
 *
 * @param graph Graph to score.
 * @returns Score between 0 and 100.
 */
function calculateMECEScore(graph: Graph): number {
  const clamp = (value: number): number => Math.min(100, Math.max(0, value));

  const stats = getStats();
  const compression = calculateEntityCompression(graph);
  const avgDegree = calculateAvgDegree(graph);

  const reuse = clamp(100 - compression);
  const typeConsolidation = clamp(100 - stats.entityTypes * 5);
  const predicateConsolidation = clamp(100 - stats.predicates * 5);
  const connectivity = clamp(avgDegree * 10);

  // Composite score (0-100)
  const score =
    reuse * 0.4 + // 40% weight: entity reuse
    typeConsolidation * 0.25 + // 25% weight: type consolidation
    predicateConsolidation * 0.25 + // 25% weight: predicate consolidation
    connectivity * 0.1; // 10% weight: connectivity
  return clamp(score);
}
// Target: >80/100
// Components:
// - Low entity compression (high reuse)
// - Few entity types (5-10 ideal)
// - Few predicates (3-8 ideal)
// - High connectivity (avg degree >3)
// 1. Analyze entity types
/**
 * Groups the graph's entities by type and summarises each group.
 *
 * @returns `types`: one record per entity type with its entity count, up to
 *          five sample canonical names, and a suggestion when the type holds
 *          a single entity. `recommendations`: a consolidation hint when at
 *          least one such single-entity type exists.
 */
function analyzeEntityTypes(): TypeAnalysis {
  // Bucket entities by type, keeping first-seen order of types.
  const grouped = new Map<string, Entity[]>();
  for (const entity of Object.values(graph.entities)) {
    const bucket = grouped.get(entity.type);
    if (bucket) {
      bucket.push(entity);
    } else {
      grouped.set(entity.type, [entity]);
    }
  }

  const analysis = [...grouped].map(([type, members]) => {
    const isSingleton = members.length === 1;
    return {
      type,
      count: members.length,
      sampleEntities: members.slice(0, 5).map(member => member.canonical),
      suggestions: isSingleton
        ? ['Only 1 entity - consider merging with related type']
        : []
    };
  });

  const hasUnderusedType = analysis.some(entry => entry.count === 1);
  return {
    types: analysis,
    recommendations: hasUnderusedType
      ? ['Consider consolidating underused types']
      : []
  };
}
// 2. Analyze predicates
/**
 * Summarises how each predicate is used across the graph's triples.
 *
 * @returns `predicates`: per predicate, its usage count and up to three
 *          "subject → object" examples, in first-seen order. `duplicates`:
 *          whatever `findDuplicatePredicates` reports for the predicate names.
 */
function analyzePredicates(): PredicateAnalysis {
  // One tally per predicate, holding both the count and the sample list.
  const tallies = new Map<string, { count: number; examples: string[] }>();

  for (const { predicate, subject, object } of graph.triples) {
    let tally = tallies.get(predicate);
    if (!tally) {
      tally = { count: 0, examples: [] };
      tallies.set(predicate, tally);
    }
    tally.count += 1;
    if (tally.examples.length < 3) {
      tally.examples.push(`${subject} → ${object}`);
    }
  }

  // Find duplicate patterns
  const duplicates = findDuplicatePredicates(Array.from(tallies.keys()));

  const predicates = Array.from(tallies, ([name, tally]) => ({
    predicate: name,
    count: tally.count,
    examples: tally.examples
  }));
  return { predicates, duplicates };
}
// 3. Find similar predicates
/**
 * Heuristically groups predicates that look like variants of one
 * relationship.
 *
 * Predicates follow an "action_target" pattern. Variants of the same
 * relationship differ in the action but share the target — e.g. has_skill,
 * knows_skill, possesses_skill — so grouping is keyed on everything after the
 * first underscore. (Keying on the first token instead would lump unrelated
 * relations such as has_skill / has_role together and never match the
 * has_skill / knows_skill example.)
 *
 * This is a name-based heuristic only: predicates whose target is a bare
 * preposition (located_in, works_in) will also be grouped, so callers should
 * treat the result as a suggestion.
 *
 * @param predicates Predicate names to inspect.
 * @returns One group per target shared by two or more predicates; the first
 *          predicate seen is proposed as the merge target.
 */
function findDuplicatePredicates(predicates: string[]): DuplicateGroup[] {
  const byTarget = new Map<string, string[]>();

  for (const pred of predicates) {
    const separator = pred.indexOf('_');
    if (separator === -1) continue; // single-token predicate: no action/target split

    const target = pred.slice(separator + 1); // "skill" in has_skill / knows_skill
    const bucket = byTarget.get(target);
    if (bucket) {
      bucket.push(pred);
    } else {
      byTarget.set(target, [pred]);
    }
  }

  const groups: DuplicateGroup[] = [];
  byTarget.forEach((preds, target) => {
    if (preds.length >= 2) {
      groups.push({
        predicates: preds,
        reason: `All relate to '${target}'`,
        suggestedMerge: preds[0] // Use first as canonical
      });
    }
  });
  return groups;
}
Entity Types: 12
- person, organization, skill, location, project, role, etc.
Predicates: 18
- works_at, employed_by, has_skill, knows_skill, possesses_skill,
located_in, based_in, works_in, leads_project, manages_project, etc.
Entity Reuse: 25%
Predicates per Item: 1.8
MECE Score: 35/100
Issues: Too many specific types, fragmented predicates, low reuse
Entity Types: 7
- person, organization, concept, location, artifact
Predicates: 8
- works_at, has_skill, located_in, works_on, collaborates_with,
has_attribute, member_of, reports_to
Entity Reuse: 48%
Predicates per Item: 3.2
MECE Score: 68/100
Improvements: Consolidated skill/role/project → concept, merged duplicate predicates
Entity Types: 5
- person, organization, concept, location, artifact
Predicates: 5
- works_at, has_concept, located_in, works_on, collaborates_with
Entity Reuse: 65%
Predicates per Item: 4.5
MECE Score: 87/100
Final State: Highly generic types, minimal predicates, high reuse, fully MECE compliant
// ❌ BAD: Create without checking
await add_entity({ type: "skill", name: "ml" });
// ✅ GOOD: Search first
const result = await search_entity({
type: "skill",
name: "ml",
useSemanticSearch: true
});
if (!result.found) {
await add_entity({ type: "skill", name: "machine_learning" });
}// ❌ BAD: Compound concepts
person:senior_software_engineer_at_acme
// ✅ GOOD: Atomic entities + relationships
person:john_doe
-> has_role -> role:senior_engineer
-> works_at -> organization:acme_corp// ❌ BAD: Inconsistent formatting
"Machine Learning", "machine-learning", "MachineLearning"
// ✅ GOOD: Snake case, lowercase
"machine_learning"// Finds: "ml", "machine learning", "ML", "machine-learning"
const result = await search_entity({
name: "machine learning",
type: "skill",
useSemanticSearch: true // Enable vector similarity
});// Analyze logs periodically
const events = parseNDJSON('events.ndjson');

// Each successful tool call produces two log lines: an 'info' line when it
// starts and a 'debug' line (carrying `output`) when it completes. Only the
// 'debug' lines represent completed calls.

// Top tools called
const toolFreq: Record<string, number> = {};
for (const e of events) {
  if (e.level !== 'debug') continue;
  toolFreq[e.toolName] = (toolFreq[e.toolName] || 0) + 1;
}

// Entity reuse rate
// Count completed searches only. Including the 'info' start lines (which never
// have an output) doubled the denominator and capped the rate at 50%.
const searches = [];
for (const e of events) {
  if (e.toolName === 'search_entity' && e.level === 'debug') searches.push(e);
}
let found = 0;
for (const e of searches) {
  if (e.output?.found) found++;
}
// Guard the empty log: 0 / 0 would print "NaN%".
const reuseRate = searches.length === 0 ? 0 : (found / searches.length) * 100;
console.log(`Entity reuse: ${reuseRate}%`);
-
Core System
- Entity and Triple data structures
- Graph with indices (byType, byCanonical, byAlias)
- Load/save to JSON file
-
MECE Mechanisms
- Vector embeddings (768-dim)
- Semantic search (cosine similarity ≥0.85)
- Dynamic synonym mapping
- Strict naming conventions
- Post-processing merge (similarity ≥0.90)
-
Refactoring System
- analyzeEntityTypes()
- analyzePredicates()
- consolidatePredicates()
- renameEntityType()
- renamePredicate()
- Auto-refactor every 10 items
-
Event Logging
- PINO-style NDJSON format
- Wrap all tools with logging
- Track: time, level, itemId, toolName, input, output, duration
- Log file rotation (optional)
-
Quality Assurance
- findIsolatedEntities()
- cleanupIsolatedEntities()
- Auto-cleanup on item completion
- Quality metrics (compression, MECE score)
-
MCP Tools (13)
- search_entity (semantic)
- add_entity (with dedup)
- add_triple
- list_entity_types
- list_entities_by_type
- list_predicates
- find_triples
- get_stats
- add_synonym
- merge_duplicates
- check_isolated_entities
- cleanup_isolated_entities
- mark_item_processed
-
Monitoring
- Event log viewer (web UI)
- Real-time metrics dashboard
- Refactoring history
This system implements a self-improving knowledge graph that:
- Enforces MECE through 4 complementary mechanisms
- Auto-refactors every 10 items to consolidate schema
- Logs all events for debugging and analysis
- Maintains quality through isolated entity detection
- Provides 13 tools for agentic knowledge extraction
- Tracks metrics (compression, connectivity, MECE score)
- Evolves continuously from specific → generic schema
Key Insight: The graph schema is not static—it's a living system that becomes more MECE-compliant as you process more data!