Enrichment

Configure entity and content enrichment.

Entity Enrichment

Once you've extracted entities from your content, Graphlit offers additional capabilities to enrich those Persons, Organizations, etc.

Via third-party data sources, such as Diffbot, Wikipedia and Crunchbase, Graphlit can automatically look up entities and add further details to the entities it creates.

As an example, if entity extraction identifies the organization named "OpenAI" and you have configured the enrichment stage of the workflow, Graphlit will automatically fill in the company address, industries, or even revenue or investment. These properties are available for query in the GraphQL schema; for example, Organization has the properties foundingDate, industries, and address.

Entity enrichment occurs only when an observed entity is created, i.e. the first time the entity is observed. For every subsequent time the same entity is observed, an observation is created, linking the content and the observed entity, but the entity is not re-enriched.

You can assign the enrichedTypes property to limit which observed entity types will be enriched. If this is not assigned, all possible entity types for the enrichment service will be enriched.

Each entity enrichment service supports a subset of observed entity types.

Diffbot supports the enrichment of Organizations and Persons. Wikipedia supports the enrichment of Organizations, Persons, Places, Software, and Products. Crunchbase only supports the enrichment of Organizations.

Diffbot

Mutation:

# Creates a workflow from the $workflow input (required, type WorkflowInput)
# and selects the new workflow's id, name and state, plus the connector
# settings of its extraction and enrichment jobs.
mutation CreateWorkflow($workflow: WorkflowInput!) {
  createWorkflow(workflow: $workflow) {
    id
    name
    state
    # Extraction stage: one connector per job.
    extraction {
      jobs {
        connector {
          type
          contentTypes
          fileTypes
          extractedTypes
          # Settings specific to the Azure text connector.
          azureText {
            confidenceThreshold
            enablePII
          }
          # Settings specific to the Azure image connector.
          azureImage {
            confidenceThreshold
          }
        }
      }
    }
    # Enrichment stage: one connector per job.
    enrichment {
      jobs {
        connector {
          # Enrichment service (DIFFBOT in this example's variables).
          type
          # Observed entity types that enrichment is limited to, if assigned.
          enrichedTypes
        }
      }
    }
  }
}

Variables:

{
  "workflow": {
    "extraction": {
      "jobs": [
        {
          "connector": {
            "type": "AZURE_COGNITIVE_SERVICES_TEXT",
            "azureText": {
              "confidenceThreshold": 0.8
            }
          }
        }
      ]
    },
    "enrichment": {
      "jobs": [
        {
          "connector": {
            "type": "DIFFBOT",
            "enrichedTypes": [
              "ORGANIZATION"
            ]
          }
        }
      ]
    },
    "name": "Diffbot Enrichment"
  }
}

Response:

{
  "extraction": {
    "jobs": [
      {
        "connector": {
          "type": "AZURE_COGNITIVE_SERVICES_TEXT",
          "azureText": {
            "confidenceThreshold": 0.8
          }
        }
      }
    ]
  },
  "enrichment": {
    "jobs": [
      {
        "connector": {
          "type": "DIFFBOT",
          "enrichedTypes": [
            "ORGANIZATION"
          ]
        }
      }
    ]
  },
  "id": "80ff5e17-2fe4-4211-9083-da2541472e48",
  "name": "Diffbot Enrichment",
  "state": "ENABLED"
}

Wikipedia

Mutation:

# Creates a workflow from the $workflow input (required, type WorkflowInput)
# and selects the new workflow's id, name and state, plus the connector
# settings of its extraction and enrichment jobs.
mutation CreateWorkflow($workflow: WorkflowInput!) {
  createWorkflow(workflow: $workflow) {
    id
    name
    state
    # Extraction stage: one connector per job.
    extraction {
      jobs {
        connector {
          type
          contentTypes
          fileTypes
          extractedTypes
          # Settings specific to the Azure text connector.
          azureText {
            confidenceThreshold
            enablePII
          }
          # Settings specific to the Azure image connector.
          azureImage {
            confidenceThreshold
          }
        }
      }
    }
    # Enrichment stage: one connector per job.
    enrichment {
      jobs {
        connector {
          # Enrichment service (WIKIPEDIA in this example's variables).
          type
          # Observed entity types that enrichment is limited to, if assigned;
          # this example's variables leave it unassigned.
          enrichedTypes
        }
      }
    }
  }
}

Variables:

{
  "workflow": {
    "extraction": {
      "jobs": [
        {
          "connector": {
            "type": "AZURE_COGNITIVE_SERVICES_TEXT",
            "azureText": {
              "confidenceThreshold": 0.8
            }
          }
        }
      ]
    },
    "enrichment": {
      "jobs": [
        {
          "connector": {
            "type": "WIKIPEDIA"
          }
        }
      ]
    },
    "name": "Wikipedia Enrichment"
  }
}

Response:

{
  "extraction": {
    "jobs": [
      {
        "connector": {
          "type": "AZURE_COGNITIVE_SERVICES_TEXT",
          "azureText": {
            "confidenceThreshold": 0.8
          }
        }
      }
    ]
  },
  "enrichment": {
    "jobs": [
      {
        "connector": {
          "type": "WIKIPEDIA"
        }
      }
    ]
  },
  "id": "6d6dfe8c-0dad-40bf-a87d-d9b88e665ea8",
  "name": "Wikipedia Enrichment",
  "state": "ENABLED"
}

Crunchbase

Mutation:

# Creates a workflow from the $workflow input (required, type WorkflowInput)
# and selects the new workflow's id, name and state, plus the connector
# settings of its extraction and enrichment jobs.
mutation CreateWorkflow($workflow: WorkflowInput!) {
  createWorkflow(workflow: $workflow) {
    id
    name
    state
    # Extraction stage: one connector per job.
    extraction {
      jobs {
        connector {
          type
          contentTypes
          fileTypes
          extractedTypes
          # Settings specific to the Azure text connector.
          azureText {
            confidenceThreshold
            enablePII
          }
          # Settings specific to the Azure image connector.
          azureImage {
            confidenceThreshold
          }
        }
      }
    }
    # Enrichment stage: one connector per job.
    enrichment {
      jobs {
        connector {
          # Enrichment service (CRUNCHBASE in this example's variables).
          type
          # Observed entity types that enrichment is limited to, if assigned.
          enrichedTypes
        }
      }
    }
  }
}

Variables:

{
  "workflow": {
    "extraction": {
      "jobs": [
        {
          "connector": {
            "type": "AZURE_COGNITIVE_SERVICES_TEXT",
            "azureText": {
              "confidenceThreshold": 0.8
            }
          }
        }
      ]
    },
    "enrichment": {
      "jobs": [
        {
          "connector": {
            "type": "CRUNCHBASE",
            "enrichedTypes": [
              "ORGANIZATION"
            ]
          }
        }
      ]
    },
    "name": "Crunchbase Enrichment"
  }
}

Response:

{
  "extraction": {
    "jobs": [
      {
        "connector": {
          "type": "AZURE_COGNITIVE_SERVICES_TEXT",
          "azureText": {
            "confidenceThreshold": 0.8
          }
        }
      }
    ]
  },
  "enrichment": {
    "jobs": [
      {
        "connector": {
          "type": "CRUNCHBASE",
          "enrichedTypes": [
            "ORGANIZATION"
          ]
        }
      }
    ]
  },
  "id": "d951296e-de54-446c-983f-376a07b5f7ab",
  "name": "Crunchbase Enrichment",
  "state": "ENABLED"
}

Content Enrichment

As content is ingested and text is extracted, there may be hyperlinks to external web pages or files located in the content.

Via "link crawling", Graphlit offers the ability to automatically ingest this linked content.

For example, if a web page has links to external web pages, and has a link to a PDF file, you can automatically crawl those links and have Graphlit ingest the linked content.

Links to web pages are called "web links", and links to file-based content are called "file links".

Mutation (crawling web links):

# Creates a workflow from the $workflow input (required, type WorkflowInput)
# and selects the new workflow's id, name and state, plus the link-crawling
# settings of its enrichment stage.
mutation CreateWorkflow($workflow: WorkflowInput!) {
  createWorkflow(workflow: $workflow) {
    id
    name
    state
    enrichment {
      # Link crawling configuration.
      link {
        # Whether links found in ingested content are crawled.
        enableCrawling
        allowedDomains
        excludedDomains
        # Link kinds to crawl; the examples on this page use WEB and FILE.
        allowedLinks
        excludedLinks
        allowedFiles
        excludedFiles
        allowContentDomain
        maximumLinks
      }
    }
  }
}

Variables:

{
  "workflow": {
    "enrichment": {
      "link": {
        "enableCrawling": true,
        "allowedDomains": [
          "openai.com"
        ],
        "excludedDomains": [
          "microsoft.com"
        ],
        "allowedLinks": [
          "WEB"
        ],
        "allowContentDomain": false,
        "maximumLinks": 10
      }
    },
    "name": "Enrichment Stage"
  }
}

Response:

{
  "enrichment": {
    "link": {
      "enableCrawling": true,
      "allowedDomains": [
        "openai.com"
      ],
      "excludedDomains": [
        "microsoft.com"
      ],
      "allowedLinks": [
        "WEB"
      ],
      "allowContentDomain": false,
      "maximumLinks": 10
    }
  },
  "id": "667499c6-771c-4f29-9fd9-85e9f00e3c9c",
  "name": "Enrichment Stage",
  "state": "ENABLED"
}

Mutation (crawling file links):

# Creates a workflow from the $workflow input (required, type WorkflowInput)
# and selects the new workflow's id, name and state, plus the link-crawling
# settings of its enrichment stage.
mutation CreateWorkflow($workflow: WorkflowInput!) {
  createWorkflow(workflow: $workflow) {
    id
    name
    state
    enrichment {
      # Link crawling configuration.
      link {
        # Whether links found in ingested content are crawled.
        enableCrawling
        allowedDomains
        excludedDomains
        # Link kinds to crawl; the examples on this page use WEB and FILE.
        allowedLinks
        excludedLinks
        allowedFiles
        excludedFiles
        allowContentDomain
        maximumLinks
      }
    }
  }
}

Variables:

{
  "workflow": {
    "enrichment": {
      "link": {
        "enableCrawling": true,
        "allowedLinks": [
          "FILE"
        ]
      }
    },
    "name": "Enrichment Stage"
  }
}

Response:

{
  "enrichment": {
    "link": {
      "enableCrawling": true,
      "allowedLinks": [
        "FILE"
      ],
      "allowContentDomain": false
    }
  },
  "id": "472ac277-bbf9-4cdc-b22f-aba03834237c",
  "name": "Enrichment Stage",
  "state": "ENABLED"
}

Last updated