Build Knowledge Graph from PDF Documents
Use Case: Build Knowledge Graph from PDF Documents
User Intent
Operation
Prerequisites
Complete Code Example (TypeScript)
import { Graphlit } from 'graphlit-client';
import {
FilePreparationServiceTypes,
EntityExtractionServiceTypes,
ObservableTypes,
EntityState
} from 'graphlit-client/dist/generated/graphql-types';
// Client is constructed without explicit arguments.
const graphlit = new Graphlit();
console.log('=== Building Knowledge Graph from PDF ===\n');

// Step 1: Create extraction workflow
console.log('Step 1: Creating extraction workflow...');

// Preparation stage: a single job using the document preparation connector.
const preparationJob = {
  connector: {
    type: FilePreparationServiceTypes.ModelDocument // PDF, Word, Excel, etc.
  }
};

// Extraction stage: a single text-model job limited to these four entity types.
const extractionJob = {
  connector: {
    type: EntityExtractionServiceTypes.ModelText,
    extractedTypes: [
      ObservableTypes.Person,
      ObservableTypes.Organization,
      ObservableTypes.Place,
      ObservableTypes.Event
    ]
  }
};

const workflow = await graphlit.createWorkflow({
  name: "PDF Entity Extraction",
  preparation: { jobs: [preparationJob] },
  extraction: { jobs: [extractionJob] }
});
console.log(`✓ Created workflow: ${workflow.createWorkflow.id}\n`);
// Step 2: Ingest PDF
console.log('Step 2: Ingesting PDF document...');
const content = await graphlit.ingestUri(
  'https://arxiv.org/pdf/2301.00001.pdf',
  "Research Paper",
  // Three optional positional arguments are left unset so that the workflow
  // reference lands in the sixth position.
  undefined,
  undefined,
  undefined,
  // Reference the workflow created in Step 1 by id.
  { id: workflow.createWorkflow.id }
);
console.log(`✓ Ingested: ${content.ingestUri.id}\n`);
// Step 3: Wait for processing
console.log('Step 3: Waiting for entity extraction...');
// Poll every 2 seconds, but stop after 300 attempts (~10 minutes) so that content
// which never reports completion cannot hang the script forever.
const pollIntervalMs = 2000;
const maxPollAttempts = 300;
let isDone = false;
for (let attempt = 0; attempt < maxPollAttempts && !isDone; attempt++) {
  const status = await graphlit.isContentDone(content.ingestUri.id);
  // Treat a missing/null result as "not done yet" rather than failing on it.
  // NOTE(review): assumes `isContentDone` and `result` can be null in the response type — confirm.
  isDone = status.isContentDone?.result ?? false;
  if (!isDone) {
    console.log(' Processing... (checking again in 2s)');
    await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
  }
}
if (!isDone) {
  throw new Error(
    `Timed out after ${maxPollAttempts} checks waiting for content ${content.ingestUri.id} to finish processing`
  );
}
console.log('✓ Extraction complete\n');
// Step 4: Retrieve content with entities
console.log('Step 4: Retrieving extracted entities...');
const ingestedContentId = content.ingestUri.id;
const contentDetails = await graphlit.getContent(ingestedContentId);
// Fall back to an empty list when the content carries no observations.
const observations = contentDetails.content.observations ?? [];
console.log(`✓ Found ${observations.length} entity observations\n`);
// Step 5: Analyze entities by type
console.log('Step 5: Analyzing entities...\n');
// Maps each observation type to the set of distinct entity names seen for it.
const byType = new Map<string, Set<string>>();
for (const obs of observations) {
  const name = obs?.observable?.name;
  // Skip empty entries and entities without a name; they cannot be listed meaningfully
  // and would otherwise show up as "undefined" in the output.
  if (!obs || !name) continue;

  let names = byType.get(obs.type);
  if (!names) {
    names = new Set<string>();
    byType.set(obs.type, names);
  }
  names.add(name);
}

// Print up to five names per type, followed by a count of the remainder.
for (const [type, entities] of byType) {
  console.log(`${type} (${entities.size} unique):`);
  for (const name of Array.from(entities).slice(0, 5)) {
    console.log(` - ${name}`);
  }
  if (entities.size > 5) {
    console.log(` ... and ${entities.size - 5} more`);
  }
  console.log();
}
// Step 6: Query knowledge graph
console.log('Step 6: Querying knowledge graph...\n');
// The two lookups do not depend on each other, so they are issued concurrently.
const [people, orgs] = await Promise.all([
  // Get all unique people
  graphlit.queryObservables({
    filter: {
      types: [ObservableTypes.Person],
      states: [EntityState.Enabled]
    }
  }),
  // Get all organizations
  graphlit.queryObservables({
    filter: {
      types: [ObservableTypes.Organization],
      states: [EntityState.Enabled]
    }
  })
]);
// Report 0 instead of throwing when a query comes back without a result list.
console.log(`Total people in knowledge graph: ${people.observables?.results?.length ?? 0}`);
console.log(`Total organizations in knowledge graph: ${orgs.observables?.results?.length ?? 0}`);
// Step 7: Find entity relationships
console.log('\nStep 7: Analyzing entity co-occurrences...\n');
const cooccurrences: Array<{ person: string; organization: string; count: number }> = [];

/**
 * Reduces the observations of one entity type to `{ name, pages }` records, where
 * `pages` is the set of distinct page indexes the entity occurs on.
 *
 * Empty observation entries and entities without a name are skipped. Occurrences
 * without a numeric page index are ignored, so two entities are never reported as
 * sharing a page merely because both lack page information.
 */
const pagesByEntity = (type: ObservableTypes): Array<{ name: string; pages: Set<number> }> => {
  const entities: Array<{ name: string; pages: Set<number> }> = [];
  for (const obs of observations) {
    const name = obs?.observable?.name;
    if (!obs || obs.type !== type || !name) continue;

    const pages = new Set<number>();
    for (const occ of obs.occurrences ?? []) {
      const pageIndex = occ?.pageIndex;
      if (typeof pageIndex === 'number') pages.add(pageIndex);
    }
    entities.push({ name, pages });
  }
  return entities;
};

// Build each page set once up front instead of once per person/organization pair.
const organizationPages = pagesByEntity(ObservableTypes.Organization);
for (const person of pagesByEntity(ObservableTypes.Person)) {
  for (const organization of organizationPages) {
    // Count the pages on which both entities appear.
    let sharedPageCount = 0;
    for (const page of person.pages) {
      if (organization.pages.has(page)) sharedPageCount++;
    }
    if (sharedPageCount > 0) {
      cooccurrences.push({
        person: person.name,
        organization: organization.name,
        count: sharedPageCount
      });
    }
  }
}
console.log('Top person-organization relationships:');
// Rank a copy so `cooccurrences` keeps its original order, then keep the five pairs
// that share the most pages.
const topRelationships = [...cooccurrences]
  .sort((a, b) => b.count - a.count)
  .slice(0, 5);
for (const { person, organization, count } of topRelationships) {
  console.log(` ${person} ↔ ${organization} (${count} pages)`);
}
console.log('\n✓ Knowledge graph analysis complete!');
Step-by-Step Explanation
Step 1: Create Extraction Workflow
Step 2: Ingest PDF Document
Step 3: Poll for Completion
Step 4: Retrieve Extracted Entities
Step 5: Analyze Entities
Step 6: Query Knowledge Graph
Step 7: Analyze Relationships
Configuration Options
Choosing Text vs Vision Extraction
Model Selection for Quality vs Speed
Variations
Variation 1: Legal Contract Analysis
Variation 2: Research Paper Citation Network
Variation 3: Invoice/Receipt Processing
Variation 4: Medical Records Analysis
Variation 5: Batch PDF Processing
Common Issues & Solutions
Issue: No Entities Extracted from Scanned PDF
Issue: Encrypted PDF Won't Process
Issue: Missing Entities from Images/Charts
Issue: Processing Takes Too Long
Developer Hints
PDF Processing Best Practices
Vision Model Selection
Cost Optimization
Performance Optimization
Production Patterns
Pattern from Graphlit Samples
Pattern from Legal Tech
Last updated