webgraph_swh

Python bindings for Software Heritage graph access.

View Source

from ._webgraph_swh import *

__doc__ = _webgraph_swh.__doc__
if hasattr(_webgraph_swh, "__all__"):
    __all__ = _webgraph_swh.__all__

class SwhGraph:

A bidirectional Software Heritage graph with node properties.

Loads the graph and all available properties (maps, persons, strings, timestamps) from the given base path. Node IDs are integers in the half-open range [0, num_nodes).

SwhGraph(path: str)

def num_nodes(self) -> int:

Return the number of nodes in the graph.

def num_arcs(self) -> int:

Return the number of arcs in the graph.

def outdegree(self, node: int) -> int:

Return the number of successors of the given node.

Raises IndexError if node is out of range.

def indegree(self, node: int) -> int:

Return the number of predecessors of the given node.

Raises IndexError if node is out of range.

def predecessors(self, node: int) -> PySuccessorsIterator:

Return an iterator over the predecessors of the given node.

Raises IndexError if node is out of range.

def successors(self, node: int) -> PySuccessorsIterator:

Return an iterator over the successors of the given node.

Raises IndexError if node is out of range.

def committer_id(self, node: int) -> int | None:

Return the committer person ID, or None if not available.

Raises IndexError if node is out of range.

def author_id(self, node: int) -> int | None:

Return the author person ID, or None if not available.

Raises IndexError if node is out of range.

def node_type(self, node: int) -> PyNodeType:

Return the node type as a PyNodeType enum value.

Raises IndexError if node is out of range.

def committer_timestamp(self, node: int) -> int | None:

Return the committer timestamp (seconds since epoch), or None if not available.

Raises IndexError if node is out of range.

def author_timestamp(self, node: int) -> int | None:

Return the author timestamp (seconds since epoch), or None if not available.

Raises IndexError if node is out of range.

def swhid(self, node: int) -> str:

Return the SWHID of the given node as a string.

Raises IndexError if node is out of range.

def swh_link(self, node: int) -> str:

Return the URL of the Software Heritage archive page for the given node (e.g., https://archive.softwareheritage.org/swh:1:rev:...).

Raises IndexError if node is out of range.

def message(self, node: int) -> str | None:

Return the commit/tag message, or None if not available.

Raises IndexError if node is out of range.

def tag_name(self, node: int) -> str | None:

Return the tag name, or None if the node is not a release or the name is not available.

Raises IndexError if node is out of range.

def outdegrees(self) -> numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.uint32]]:

Return a numpy uint32 array of outdegrees for all nodes, computed in parallel. The array is indexed by node ID.

def indegrees(self) -> numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.uint32]]:

Return a numpy uint32 array of indegrees for all nodes, computed in parallel. The array is indexed by node ID.

def top_k_out(self, k: int) -> numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.uint64]]:

Return a (k, 2) array: column 0 = node IDs, column 1 = outdegrees.

def top_k_in(self, k: int) -> numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.uint64]]:

Return a (k, 2) array: column 0 = node IDs, column 1 = indegrees.

def node_type_freqs(self) -> dict[str, int]:

Return node-type frequencies as a dictionary, computed in parallel.

Dictionary keys are node-type names (Content, Directory, Origin, Release, Revision, Snapshot).

def contributor_node_counts(self, contributor_id: int) -> dict[str, int]:

Count nodes where contributor_id appears as committer, author, or both.

Returns {"committer": N, "author": M, "both": K}. Computed in parallel.

def bfs(self) -> PyBfsIterator:

BFS over all connected components.

Yields (root, parent, node, distance) tuples where root identifies the BFS tree (starting node of the component), parent is the node from which node was discovered (equal to node for roots), and distance is the hop count from root.

def bfs_from_node(self, node: int) -> PyBfsIterator:

BFS from a single starting node.

Yields (root, parent, node, distance) tuples where root is always the starting node, parent is the node from which node was discovered (equal to node for the root), and distance is the hop count from the starting node.

Raises IndexError if node is out of range.

def subgraph(self, node_types: str) -> FilteredSwhGraph:

Return a FilteredSwhGraph restricted to the given node types.

The constraint string is a comma-separated list of type abbreviations (cnt, dir, ori, rel, rev, snp) or * for all types.

Example:

revrel = g.subgraph("rev,rel")

def forward_graph(self) -> BvGraph:

Load the forward BvGraph from the same base path.

Returns a webgraph.BvGraph instance.

def backward_graph(self) -> BvGraph:

Load the backward (transposed) BvGraph from the same base path.

Returns a webgraph.BvGraph instance.

basepath: str

The base path from which the graph was loaded.

class FilteredSwhGraph:

A view of an SwhGraph restricted to specific node types.

Created by SwhGraph.subgraph(). Node IDs are not renumbered.

def num_nodes(self) -> int:

Return the number of nodes in the underlying (unfiltered) graph.

def num_nodes_sub(self) -> int:

Return the number of nodes matching the node-type constraint.

def outdegree(self, node: int) -> int:

Return the number of successors matching the node-type constraint.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def indegree(self, node: int) -> int:

Return the number of predecessors matching the node-type constraint.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def successors(self, node: int) -> PySuccessorsIterator:

Return an iterator over successors matching the node-type constraint.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def predecessors(self, node: int) -> PySuccessorsIterator:

Return an iterator over predecessors matching the node-type constraint.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def outdegrees(self) -> numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.uint32]]:

Return a numpy array of filtered outdegrees for all nodes, computed in parallel.

Nodes not matching the constraint have degree 0, so that array[node_id] is the filtered outdegree for matching nodes.

def indegrees(self) -> numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.uint32]]:

Return a numpy array of filtered indegrees for all nodes, computed in parallel.

Nodes not matching the constraint have degree 0, so that array[node_id] is the filtered indegree for matching nodes.

def node_type_freqs(self) -> dict[str, int]:

Return node-type frequencies as a dictionary, computed in parallel.

Only nodes matching the constraint are counted. Dictionary keys are node-type names (Content, Directory, Origin, Release, Revision, Snapshot).

def contributor_node_counts(self, contributor_id: int) -> dict[str, int]:

Count nodes matching the constraint where contributor_id appears as committer, author, or both.

Returns {"committer": N, "author": M, "both": K}. Computed in parallel.

def committer_id(self, node: int) -> int | None:

Return the committer person ID, or None if not available.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def author_id(self, node: int) -> int | None:

Return the author person ID, or None if not available.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def node_type(self, node: int) -> PyNodeType:

Return the node type as a PyNodeType enum value.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def committer_timestamp(self, node: int) -> int | None:

Return the committer timestamp (seconds since epoch), or None if not available.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def author_timestamp(self, node: int) -> int | None:

Return the author timestamp (seconds since epoch), or None if not available.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def swhid(self, node: int) -> str:

Return the SWHID of the given node as a string.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def swh_link(self, node: int) -> str:

Return the URL of the Software Heritage archive page for the given node (e.g., https://archive.softwareheritage.org/swh:1:rev:...).

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def message(self, node: int) -> str | None:

Return the commit/tag message, or None if not available.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def tag_name(self, node: int) -> str | None:

Return the tag name, or None if the node is not a release or the name is not available.

Raises IndexError if node is out of range, or ValueError if the node does not match the constraint.

def top_k_out(self, k: int) -> numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.uint64]]:

Return a (k, 2) array: column 0 = node IDs, column 1 = outdegrees.

def top_k_in(self, k: int) -> numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.uint64]]:

Return a (k, 2) array: column 0 = node IDs, column 1 = indegrees.

def has_node(self, node: int) -> bool:

Return whether the given node matches the node-type constraint.

class ContributorNamesMap:

Sparse map from contributor IDs to display names.

ContributorNamesMap(path: str)

def get_name(self, contributor_id: int) -> str | None:

Return the display name for the given contributor ID, or None if no name is available for it.

class PyNodeType:

SWH node types.

Integer values match the encoding used in the SWH graph: Content=0, Directory=1, Origin=2, Release=3, Revision=4, Snapshot=5.

Content = PyNodeType.Content

Directory = PyNodeType.Directory

Origin = PyNodeType.Origin

Release = PyNodeType.Release

Revision = PyNodeType.Revision

Snapshot = PyNodeType.Snapshot

class PySuccessorsIterator:

Iterator over node IDs (successors or predecessors).

class PyBfsIterator:

Iterator for breadth-first traversal.

Yields (root, parent, node, distance) tuples. When traversing all components, root identifies which component the node belongs to.