Build Knowledge Graph from GitHub Repositories
User Intent
Operation
Prerequisites
Complete Code Example (TypeScript)
import { Graphlit } from 'graphlit-client';
import {
ContentTypes,
EntityExtractionServiceTypes,
EntityState,
FeedServiceTypes,
FeedTypes,
ObservableTypes
} from 'graphlit-client/dist/generated/graphql-types';
// Graphlit client, constructed with no arguments.
const graphlit = new Graphlit();

console.log('=== Building Knowledge Graph from GitHub ===\n');

// Step 1: Create extraction workflow
console.log('Step 1: Creating entity extraction workflow...');

// Entity types the extraction job is asked to produce.
const extractedEntityTypes = [
  ObservableTypes.Repo,
  ObservableTypes.Person,
  ObservableTypes.Organization,
  ObservableTypes.Software,
  ObservableTypes.Category,
  ObservableTypes.Label,
];

// Single extraction job using the model-text connector.
const extractionJob = {
  connector: {
    type: EntityExtractionServiceTypes.ModelText,
    extractedTypes: extractedEntityTypes,
  },
};

// The returned workflow id is attached to both feeds created in later steps.
const workflow = await graphlit.createWorkflow({
  name: "GitHub Entity Extraction",
  extraction: { jobs: [extractionJob] },
});
console.log(`✓ Workflow: ${workflow.createWorkflow.id}\n`);
// Steps 2 and 3 create two feeds against the same GitHub repository:
// one for repository files and one for issues. Both are attached to the
// extraction workflow created in Step 1.

// Fail fast when the token is absent. A non-null assertion would only silence
// the compiler and let the feeds be created with an undefined credential.
const githubToken = process.env.GITHUB_TOKEN;
if (!githubToken) {
  throw new Error('GITHUB_TOKEN environment variable is required to create the GitHub feeds');
}

// Repository coordinates and credentials shared by both feeds.
const githubRepository = {
  repositoryOwner: 'graphlit',
  repositoryName: 'graphlit-samples',
  personalAccessToken: githubToken,
};

// Step 2: Create GitHub repository feed
console.log('Step 2: Creating GitHub repository feed...');
const repoFeed = await graphlit.createFeed({
  name: "Graphlit Samples Repo",
  type: FeedTypes.Site,
  site: {
    type: FeedServiceTypes.GitHub,
    github: { ...githubRepository },
    // Restrict the sync to selected paths and skip dependency/build folders.
    allowedPaths: ['README.md', 'docs/**', 'python/**', 'nextjs/**'],
    excludedPaths: ['**/node_modules/**', '**/dist/**'],
  },
  workflow: { id: workflow.createWorkflow.id },
});
console.log(`✓ Repo Feed: ${repoFeed.createFeed.id}\n`);

// Step 3: Create GitHub issues feed
console.log('Step 3: Creating GitHub issues feed...');
const issuesFeed = await graphlit.createFeed({
  name: "Graphlit Issues",
  type: FeedTypes.Issue,
  issue: {
    type: FeedServiceTypes.GitHubIssues,
    github: { ...githubRepository },
    // Upper bound passed to the feed for how many issues to read.
    readLimit: 100,
  },
  workflow: { id: workflow.createWorkflow.id },
});
console.log(`✓ Issues Feed: ${issuesFeed.createFeed.id}\n`);
// Step 4: Wait for sync
console.log('Step 4: Syncing repository...');

// Poll both feeds until each reports done. The wait is bounded so a feed that
// never completes surfaces as an error instead of hanging the script forever.
const SYNC_POLL_INTERVAL_MS = 5000;
const SYNC_TIMEOUT_MS = 30 * 60 * 1000;
const syncDeadline = Date.now() + SYNC_TIMEOUT_MS;

let repoDone = false;
let issuesDone = false;

while (!repoDone || !issuesDone) {
  // Only re-check a feed that has not finished yet.
  if (!repoDone) {
    const repoStatus = await graphlit.isFeedDone(repoFeed.createFeed.id);
    // Treat a missing result as "not done yet" rather than storing null/undefined.
    repoDone = repoStatus.isFeedDone?.result ?? false;
  }
  if (!issuesDone) {
    const issuesStatus = await graphlit.isFeedDone(issuesFeed.createFeed.id);
    issuesDone = issuesStatus.isFeedDone?.result ?? false;
  }

  if (repoDone && issuesDone) {
    break;
  }

  if (Date.now() >= syncDeadline) {
    throw new Error(
      `Timed out after ${SYNC_TIMEOUT_MS / 1000}s waiting for feeds to sync ` +
        `(repo done: ${repoDone}, issues done: ${issuesDone})`
    );
  }

  console.log(' Syncing... (checking again in 5s)');
  await new Promise(resolve => setTimeout(resolve, SYNC_POLL_INTERVAL_MS));
}
console.log('✓ Sync complete\n');
// Step 5: Query repository content
console.log('Step 5: Querying repository files...');
// File-type content that came from the repository feed.
const repoFileFilter = {
  types: [ContentTypes.File],
  feeds: [{ id: repoFeed.createFeed.id }],
};
const repoFiles = await graphlit.queryContents(repoFileFilter);
const repoFileCount = repoFiles.contents.results.length;
console.log(`✓ Synced ${repoFileCount} files\n`);

// Step 6: Query issues
console.log('Step 6: Querying issues...');
// Issue-type content that came from the issues feed.
const issueFilter = {
  types: [ContentTypes.Issue],
  feeds: [{ id: issuesFeed.createFeed.id }],
};
const issues = await graphlit.queryContents(issueFilter);
const issueCount = issues.contents.results.length;
console.log(`✓ Synced ${issueCount} issues\n`);
// Step 7: Extract repository entities
console.log('Step 7: Analyzing repository entities...\n');

// Query observables of a single entity type.
const observablesOfType = (type: ObservableTypes) =>
  graphlit.queryObservables({ filter: { types: [type] } });

// The three lookups are independent, so they are issued concurrently;
// the counts are still printed in the same order as before.
const [repos, people, software] = await Promise.all([
  observablesOfType(ObservableTypes.Repo), // repositories
  observablesOfType(ObservableTypes.Person), // contributors
  observablesOfType(ObservableTypes.Software), // dependencies
]);

console.log(`Repositories: ${repos.observables.results.length}`);
console.log(`Contributors: ${people.observables.results.length}`);
console.log(`Software/Dependencies: ${software.observables.results.length}\n`);
// Step 8: Analyze issue labels
console.log('Step 8: Analyzing issue labels...\n');

// label -> number of issues carrying that label
const labelCounts = new Map<string, number>();
for (const issueContent of issues.contents.results) {
  const issueLabels = issueContent.issue?.labels || [];
  for (const label of issueLabels) {
    // Skip empty/null entries, as the original filter(Boolean) did.
    if (!label) continue;
    const seenSoFar = labelCounts.get(label) || 0;
    labelCounts.set(label, seenSoFar + 1);
  }
}

console.log('Most common issue labels:');
// Highest counts first, top five only.
const topLabels = [...labelCounts]
  .sort(([, countA], [, countB]) => countB - countA)
  .slice(0, 5);
for (const [label, count] of topLabels) {
  console.log(` ${label}: ${count} issues`);
}
// Step 9: Build contributor network
console.log('\nStep 9: Building contributor network...\n');

// contributor name -> how many Person observations mention them in files and
// in issues, plus the combined total used for ranking.
const contributors = new Map<string, {
  files: number;
  issues: number;
  total: number;
}>();

// Files and issues are tallied by the same loop; the first tuple element names
// the counter that each Person observation increments.
const contributorSources = [
  ['files', repoFiles.contents.results],
  ['issues', issues.contents.results],
] as const;

for (const [counter, items] of contributorSources) {
  for (const item of items) {
    for (const obs of item.observations ?? []) {
      if (!obs || obs.type !== ObservableTypes.Person) continue;

      const name = obs.observable.name;
      // NOTE(review): skip observations without a name so no "undefined" row
      // appears in the ranking — confirm whether name can actually be absent.
      if (!name) continue;

      let stats = contributors.get(name);
      if (!stats) {
        stats = { files: 0, issues: 0, total: 0 };
        contributors.set(name, stats);
      }
      stats[counter]++;
      stats.total++;
    }
  }
}

console.log('Top contributors:');
// Rank by combined mentions, highest first, top five only.
Array.from(contributors.entries())
  .sort((a, b) => b[1].total - a[1].total)
  .slice(0, 5)
  .forEach(([name, stats]) => {
    console.log(` ${name}: ${stats.files} files, ${stats.issues} issues`);
  });
console.log('\n✓ Repository analysis complete!');
Step-by-Step Explanation
Step 1: Create Entity Extraction Workflow
Step 2: Configure GitHub Repository Feed
Step 3: Configure GitHub Issues Feed
Step 4: GitHub Token Setup
Step 5: Analyze Repository Files
Step 6: Analyze GitHub Issues
Step 7: Build Contributor Graph
Step 8: Dependency Analysis
Configuration Options
Scope the repository sync
Limit issue backfill size
Variations
Variation 1: Multi-Repository Analysis
Variation 2: Dependency Graph Visualization
Variation 3: Issue Classification by Entities
Variation 4: Contributor Activity Timeline
Variation 5: Cross-Repository Entity Linking
Common Issues & Solutions
Issue: Large Repository, Slow Sync
Issue: GitHub API Rate Limiting
Issue: Missing Dependencies from package.json
Issue: No Contributor Entities
Developer Hints
GitHub Token Best Practices
File Type Recommendations
Performance Optimization
Entity Quality by Source
Production Patterns
Pattern from Graphlit Samples
Open Source Intelligence Use Cases
Last updated