"""Definitions of Edge, Vertex and Graph."""
import collections
import itertools


VACANT_EDGE_ID = -1
VACANT_VERTEX_ID = -1
VACANT_EDGE_LABEL = -1
VACANT_VERTEX_LABEL = -1
VACANT_GRAPH_ID = -1
AUTO_EDGE_ID = -1


class Edge(object):
    """Edge class."""

    def __init__(self,
                 eid=VACANT_EDGE_ID,
                 frm=VACANT_VERTEX_ID,
                 to=VACANT_VERTEX_ID,
                 elb=VACANT_EDGE_LABEL):
        """Initialize Edge instance.

        Args:
            eid: edge id.
            frm: source vertex id.
            to: destination vertex id.
            elb: edge label.
        """
        self.eid = eid
        self.frm = frm
        self.to = to
        self.elb = elb


class Vertex(object):
    """Vertex class."""

    def __init__(self,
                 vid=VACANT_VERTEX_ID,
                 vlb=VACANT_VERTEX_LABEL):
        """Initialize Vertex instance.

        Args:
            vid: id of this vertex.
            vlb: label of this vertex.
        """
        self.vid = vid
        self.vlb = vlb
        # Outgoing edges keyed by destination vertex id, so there is at most
        # one edge per (source, destination) pair.
        self.edges = dict()

    def add_edge(self, eid, frm, to, elb):
        """Add an outgoing edge, replacing any existing edge to `to`."""
        self.edges[to] = Edge(eid, frm, to, elb)


class Graph(object):
    """Graph class."""

    def __init__(self,
                 gid=VACANT_GRAPH_ID,
                 is_undirected=True,
                 eid_auto_increment=True):
        """Initialize Graph instance.

        Args:
            gid: id of this graph.
            is_undirected: whether this graph is undirected.
            eid_auto_increment: whether to assign edge ids automatically.
        """
        self.gid = gid
        self.is_undirected = is_undirected
        self.vertices = dict()
        self.set_of_elb = collections.defaultdict(set)
        self.set_of_vlb = collections.defaultdict(set)
        self.eid_auto_increment = eid_auto_increment
        self.counter = itertools.count()

    def get_num_vertices(self):
        """Return number of vertices in the graph."""
        return len(self.vertices)

    def add_vertex(self, vid, vlb):
        """Add a vertex to the graph; an already known `vid` is ignored.

        Returns:
            This graph, so calls can be chained.
        """
        if vid in self.vertices:
            return self
        self.vertices[vid] = Vertex(vid, vlb)
        self.set_of_vlb[vlb].add(vid)
        return self

    def add_edge(self, eid, frm, to, elb):
        """Add an edge to the graph; an already existing edge is ignored.

        Args:
            eid: edge id, ignored when `eid_auto_increment` is set.
            frm: source vertex id (must already be in the graph).
            to: destination vertex id (must already be in the graph).
            elb: edge label.

        Returns:
            This graph, so calls can be chained.

        Raises:
            KeyError: if `frm` (or, for undirected graphs, `to`) is unknown.
        """
        # Membership test: the previous identity test (`frm is self.vertices`)
        # was never true, so duplicates silently replaced the stored edge.
        if (frm in self.vertices and
                to in self.vertices and
                to in self.vertices[frm].edges):
            return self
        if self.eid_auto_increment:
            eid = next(self.counter)
        self.vertices[frm].add_edge(eid, frm, to, elb)
        self.set_of_elb[elb].add((frm, to))
        if self.is_undirected:
            # Both directions share the same edge id.
            self.vertices[to].add_edge(eid, to, frm, elb)
            self.set_of_elb[elb].add((to, frm))
        return self

    def display(self):
        """Print the graph in 't/v/e' text form and return the v/e part."""
        display_str = ''
        print('t # {}'.format(self.gid))
        for vid in self.vertices:
            print('v {} {}'.format(vid, self.vertices[vid].vlb))
            display_str += 'v {} {} '.format(vid, self.vertices[vid].vlb)
        for frm in self.vertices:
            edges = self.vertices[frm].edges
            for to in edges:
                if self.is_undirected:
                    # Each undirected edge is stored twice; show it once.
                    if frm < to:
                        print('e {} {} {}'.format(frm, to, edges[to].elb))
                        display_str += 'e {} {} {} '.format(
                            frm, to, edges[to].elb)
                else:
                    print('e {} {} {}'.format(frm, to, edges[to].elb))
                    display_str += 'e {} {} {}'.format(frm, to, edges[to].elb)
        return display_str

    def plot(self):
        """Visualize the graph (needs networkx and matplotlib)."""
        try:
            import networkx as nx
            import matplotlib.pyplot as plt
        except Exception as e:
            print('Can not plot graph: {}'.format(e))
            return
        gnx = nx.Graph() if self.is_undirected else nx.DiGraph()
        vlbs = {vid: v.vlb for vid, v in self.vertices.items()}
        elbs = {}
        for vid, v in self.vertices.items():
            gnx.add_node(vid, label=v.vlb)
        for vid, v in self.vertices.items():
            for to, e in v.edges.items():
                if (not self.is_undirected) or vid < to:
                    gnx.add_edge(vid, to, label=e.elb)
                    elbs[(vid, to)] = e.elb
        fsize = (min(16, 1 * len(self.vertices)),
                 min(16, 1 * len(self.vertices)))
        plt.figure(3, figsize=fsize)
        pos = nx.spectral_layout(gnx)
        nx.draw_networkx(gnx, pos, arrows=True, with_labels=True, labels=vlbs)
        nx.draw_networkx_edge_labels(gnx, pos, edge_labels=elbs)
        plt.show()
"""Implementation of a gSpan graph database."""
import codecs

from .graph import AUTO_EDGE_ID
from .graph import Graph


class GraphDatabase(object):
    """Collection of graphs read from one or more gSpan text files.

    Graph ids are assigned sequentially across all calls to `read_graphs`.
    """

    def __init__(self):
        """Create an empty database."""
        self._graph_cnt = 0  # id that the next stored graph will get
        self._graphs = dict()  # graph id -> Graph

    def _store(self, graph):
        """Register `graph` under the next free id and return that id."""
        gid = self._graph_cnt
        self._graphs[gid] = graph
        self._graph_cnt += 1
        return gid

    def read_graphs(self, filename):
        """Read all graphs of `filename` and add them to the database.

        Lines starting with 't' open a new graph ('t # -1' ends the file),
        'v <id> <label>' adds a vertex and 'e <frm> <to> <label>' an edge.
        Ids and labels are kept as the strings found in the file.

        Args:
            filename: path of a UTF-8 encoded graph file.

        Returns:
            The list of ids given to the graphs read, in file order.
        """
        indexes = []
        tgraph = None
        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                cols = line.strip().split(' ')
                if cols[0] == 't':
                    if tgraph is not None:
                        indexes.append(self._store(tgraph))
                        tgraph = None
                    if cols[-1] == '-1':
                        break
                    tgraph = Graph(self._graph_cnt,
                                   is_undirected=True,
                                   eid_auto_increment=True)
                elif cols[0] == 'v':
                    tgraph.add_vertex(cols[1], cols[2])
                elif cols[0] == 'e':
                    tgraph.add_edge(AUTO_EDGE_ID, cols[1], cols[2], cols[3])
        # Adapt to input files that do not end with 't # -1'. Storing through
        # _store also advances the id counter; previously it did not, so the
        # next file read overwrote this graph.
        if tgraph is not None:
            indexes.append(self._store(tgraph))
        return indexes
"""Implementation of gSpan."""
import collections
import copy
import functools
import itertools
import time

from .graph import AUTO_EDGE_ID
from .graph import Graph
from .graph import VACANT_GRAPH_ID
from .graph import VACANT_VERTEX_LABEL


def record_timestamp(func):
    """Record timestamp before and after call of `func`.

    Timestamps are stored in `self.timestamps` under '<name>_in' and
    '<name>_out'. The wrapped method's return value is passed through.
    """

    @functools.wraps(func)
    def deco(self):
        self.timestamps[func.__name__ + '_in'] = time.time()
        result = func(self)
        self.timestamps[func.__name__ + '_out'] = time.time()
        return result

    return deco


class DFSedge(object):
    """DFSedge class."""

    def __init__(self, frm, to, vevlb):
        """Initialize DFSedge instance.

        Args:
            frm: DFS index of the source vertex.
            to: DFS index of the destination vertex.
            vevlb: (source vertex label, edge label, destination vertex label).
        """
        self.frm = frm
        self.to = to
        self.vevlb = vevlb

    def __eq__(self, other):
        """Check equivalence of DFSedge."""
        return (self.frm == other.frm and
                self.to == other.to and
                self.vevlb == other.vevlb)

    def __ne__(self, other):
        """Check if not equal."""
        return not self.__eq__(other)

    def __repr__(self):
        """Represent DFSedge in string way."""
        return '(frm={}, to={}, vevlb={})'.format(
            self.frm, self.to, self.vevlb
        )


class DFScode(list):
    """DFScode is a list of DFSedge."""

    def __init__(self):
        """Initialize DFScode."""
        super().__init__()
        self.rmpath = list()

    def __eq__(self, other):
        """Check equivalence of DFScode."""
        la, lb = len(self), len(other)
        if la != lb:
            return False
        for i in range(la):
            if self[i] != other[i]:
                return False
        return True

    def __ne__(self, other):
        """Check if not equal."""
        return not self.__eq__(other)

    def __repr__(self):
        """Represent DFScode in string way."""
        return ''.join(['[', ','.join(
            [str(dfsedge) for dfsedge in self]), ']']
        )

    def push_back(self, frm, to, vevlb):
        """Update DFScode by adding one edge."""
        self.append(DFSedge(frm, to, vevlb))
        return self

    def to_graph(self, gid=VACANT_GRAPH_ID, is_undirected=True):
        """Construct a graph according to the dfs code."""
        g = Graph(gid,
                  is_undirected=is_undirected,
                  eid_auto_increment=True)
        for dfsedge in self:
            frm, to, (vlb1, elb, vlb2) = dfsedge.frm, dfsedge.to, dfsedge.vevlb
            # A vacant label means the vertex was introduced by an earlier
            # edge of the code.
            if vlb1 != VACANT_VERTEX_LABEL:
                g.add_vertex(frm, vlb1)
            if vlb2 != VACANT_VERTEX_LABEL:
                g.add_vertex(to, vlb2)
            g.add_edge(AUTO_EDGE_ID, frm, to, elb)
        return g

    def from_graph(self, g):
        """Build DFScode from graph `g`."""
        raise NotImplementedError('Not implemented yet.')

    def build_rmpath(self):
        """Build right most path.

        `self.rmpath` receives indices into this code, starting with the
        last forward edge and walking back towards the root.
        """
        self.rmpath = list()
        old_frm = None
        for i in range(len(self) - 1, -1, -1):
            dfsedge = self[i]
            frm, to = dfsedge.frm, dfsedge.to
            if frm < to and (old_frm is None or to == old_frm):
                self.rmpath.append(i)
                old_frm = frm
        return self

    def get_num_vertices(self):
        """Return number of vertices in the corresponding graph."""
        return len(set(
            [dfsedge.frm for dfsedge in self] +
            [dfsedge.to for dfsedge in self]
        ))


class PDFS(object):
    """PDFS class: one edge of an embedding, linked to the previous one."""

    def __init__(self, gid=VACANT_GRAPH_ID, edge=None, prev=None):
        """Initialize PDFS instance."""
        self.gid = gid
        self.edge = edge
        self.prev = prev


class Projected(list):
    """Projected is a list of PDFS.

    Each element of Projected is a projection one frequent graph in one
    original graph.
    """

    def __init__(self):
        """Initialize Projected instance."""
        super(Projected, self).__init__()

    def push_back(self, gid, edge, prev):
        """Update this Projected instance."""
        self.append(PDFS(gid, edge, prev))
        return self


class History(object):
    """History class: the edges and vertices used by one embedding."""

    def __init__(self, g, pdfs):
        """Initialize History instance by following the `prev` links of `pdfs`."""
        super(History, self).__init__()
        self.edges = list()
        self.vertices_used = collections.defaultdict(int)
        self.edges_used = collections.defaultdict(int)
        if pdfs is None:
            return
        while pdfs:
            e = pdfs.edge
            self.edges.append(e)
            (self.vertices_used[e.frm],
             self.vertices_used[e.to],
             self.edges_used[e.eid]) = 1, 1, 1

            pdfs = pdfs.prev
        # Collected last-to-first; restore DFS-code order.
        self.edges = self.edges[::-1]

    def has_vertex(self, vid):
        """Check if the vertex with vid exists in the history."""
        return self.vertices_used[vid] == 1

    def has_edge(self, eid):
        """Check if the edge with eid exists in the history."""
        return self.edges_used[eid] == 1


class gSpan(object):
    """`gSpan` algorithm."""

    def __init__(self,
                 task,
                 min_num_vertices=1,
                 max_num_vertices=float('inf'),
                 is_undirected=True,
                 verbose=False,
                 visualize=False,
                 where=False):
        """Initialize gSpan instance.

        Args:
            task: object providing `database`, `gid_subsets`, `prune` and
                `store`.
            min_num_vertices: minimum pattern size reported by `_report`.
            max_num_vertices: patterns are not grown beyond this size.
            is_undirected: whether graphs are treated as undirected.
            verbose: print each code checked by `_is_min`.
            visualize: plot patterns in `_report`.
            where: print supporting graph ids in `_report`.
        """
        self._is_undirected = is_undirected
        self._task = task
        self._database = task.database
        self._min_num_vertices = min_num_vertices
        self._max_num_vertices = max_num_vertices
        self._DFScode = DFScode()
        self._support = 0
        self._frequent_size1_subgraphs = list()
        # Include subgraphs with
        # any num(but >= 2, <= max_num_vertices) of vertices.
        # Both lists are used by _report/print_results and were never created.
        self._frequent_subgraphs = list()
        self._subgraph_occurrences = list()
        self._gid_subset_ids = dict()
        self._counter = itertools.count()
        self._verbose = verbose
        self._visualize = visualize
        self._where = where
        self.timestamps = dict()
        if self._max_num_vertices < self._min_num_vertices:
            print('Max number of vertices can not be smaller than '
                  'min number of that.\n'
                  'Set max_num_vertices = min_num_vertices.')
            self._max_num_vertices = self._min_num_vertices

    def time_stats(self):
        """Print stats of time; requires `run` to have completed."""
        func_names = ['run']
        time_deltas = collections.defaultdict(float)
        for fn in func_names:
            time_deltas[fn] = round(
                self.timestamps[fn + '_out'] - self.timestamps[fn + '_in'],
                2
            )

        print('Total:\t{} s'.format(time_deltas['run']))

        return self

    def _get_gid_subsets(self, projected):
        """Split the graph ids covered by `projected` by task subset."""
        subsets = [[] for _ in self._task.gid_subsets]
        gids = set([g.gid for g in projected])
        for gid in gids:
            subsets[self._gid_subset_ids[gid]].append(gid)
        return subsets

    @record_timestamp
    def run(self):
        """Run the gSpan algorithm."""
        root = collections.defaultdict(Projected)
        gids = set([gid for gid_subset in self._task.gid_subsets for gid in gid_subset])
        self._gid_subset_ids = {}
        # If a gid appears in several subsets, the last subset wins.
        for i, gid_subset in enumerate(self._task.gid_subsets):
            for gid in gid_subset:
                self._gid_subset_ids[gid] = i

        for gid in gids:
            g = self._database._graphs[gid]
            for vid, v in g.vertices.items():
                edges = self._get_forward_root_edges(g, vid)
                for e in edges:
                    root[(v.vlb, e.elb, g.vertices[e.to].vlb)].append(
                        PDFS(gid, e, None)
                    )

        for vevlb, projected in root.items():
            self._DFScode.append(DFSedge(0, 1, vevlb))
            self._subgraph_mining(projected)
            self._DFScode.pop()

    def _report(self, projected):
        """Record the current pattern and print it."""
        self._frequent_subgraphs.append(copy.copy(self._DFScode))
        # NOTE(review): print_results indexes _subgraph_occurrences in
        # parallel with _frequent_subgraphs; the supporting graph ids are
        # assumed to be the intended content - confirm.
        self._subgraph_occurrences.append(
            sorted(set([p.gid for p in projected])))
        if self._DFScode.get_num_vertices() < self._min_num_vertices:
            return
        g = self._DFScode.to_graph(gid=next(self._counter),
                                   is_undirected=self._is_undirected)
        g.display()
        print('\nSupport: {}'.format(self._support))

        if self._visualize:
            g.plot()
        if self._where:
            print('where: {}'.format(list(set([p.gid for p in projected]))))
        print('\n-----------------\n')

    def print_results(self):
        """Print every pattern recorded by `_report` with its occurrences."""
        for i, subgraph in enumerate(self._frequent_subgraphs):
            g = subgraph.to_graph(gid=next(self._counter),
                                  is_undirected=self._is_undirected)
            g.display()
            print(self._subgraph_occurrences[i])

    def _get_forward_root_edges(self, g, frm):
        """Return the edges leaving `frm` that may start a DFS code."""
        result = []
        v_frm = g.vertices[frm]
        for to, e in v_frm.edges.items():
            if (not self._is_undirected) or v_frm.vlb <= g.vertices[to].vlb:
                result.append(e)
        return result

    def _get_backward_edge(self, g, e1, e2, history):
        """Return an unused edge from `e2.to` back to `e1.frm`, or None."""
        if self._is_undirected and e1 == e2:
            return None
        for to, e in g.vertices[e2.to].edges.items():
            if history.has_edge(e.eid) or e.to != e1.frm:
                continue
            # If we returned here unconditionally, then
            # self._DFScode[0] != dfs_code_min[0] would have to be
            # checked in _is_min(). Instead, filter here:
            if self._is_undirected:
                if e1.elb < e.elb or (
                        e1.elb == e.elb and
                        g.vertices[e1.to].vlb <= g.vertices[e2.to].vlb):
                    return e
            else:
                # Compare labels with labels (the Vertex object itself was
                # compared before, which is not a valid label comparison).
                if g.vertices[e1.frm].vlb < g.vertices[e2.to].vlb or (
                        g.vertices[e1.frm].vlb == g.vertices[e2.to].vlb and
                        e1.elb <= e.elb):
                    return e
        return None

    def _get_forward_pure_edges(self, g, rm_edge, min_vlb, history):
        """Return edges from the rightmost vertex to unused vertices."""
        result = []
        for to, e in g.vertices[rm_edge.to].edges.items():
            if min_vlb <= g.vertices[e.to].vlb and (
                    not history.has_vertex(e.to)):
                result.append(e)
        return result

    def _get_forward_rmpath_edges(self, g, rm_edge, min_vlb, history):
        """Return forward edges branching off the source of `rm_edge`."""
        result = []
        to_vlb = g.vertices[rm_edge.to].vlb
        for to, e in g.vertices[rm_edge.frm].edges.items():
            new_to_vlb = g.vertices[to].vlb
            if (rm_edge.to == e.to or
                    min_vlb > new_to_vlb or
                    history.has_vertex(e.to)):
                continue
            if rm_edge.elb < e.elb or (rm_edge.elb == e.elb and
                                       to_vlb <= new_to_vlb):
                result.append(e)
        return result

    def _is_min(self):
        """Check whether the current DFS code is the minimum one.

        The minimum code of the pattern graph is rebuilt edge by edge and
        compared with `self._DFScode`; the first mismatch returns False.
        """
        if self._verbose:
            print('is_min: checking {}'.format(self._DFScode))
        if len(self._DFScode) == 1:
            return True
        g = self._DFScode.to_graph(gid=VACANT_GRAPH_ID,
                                   is_undirected=self._is_undirected)
        dfs_code_min = DFScode()
        root = collections.defaultdict(Projected)
        for vid, v in g.vertices.items():
            edges = self._get_forward_root_edges(g, vid)
            for e in edges:
                root[(v.vlb, e.elb, g.vertices[e.to].vlb)].append(
                    PDFS(g.gid, e, None))
        min_vevlb = min(root.keys())
        dfs_code_min.append(DFSedge(0, 1, min_vevlb))

        # No need to check if is min code because of pruning in get_*_edge*.

        def project_is_min(projected):
            dfs_code_min.build_rmpath()
            rmpath = dfs_code_min.rmpath
            min_vlb = dfs_code_min[0].vevlb[0]
            maxtoc = dfs_code_min[rmpath[0]].to

            # Backward extensions come first in the code ordering.
            backward_root = collections.defaultdict(Projected)
            flag, newto = False, 0,
            end = 0 if self._is_undirected else -1
            for i in range(len(rmpath) - 1, end, -1):
                if flag:
                    break
                for p in projected:
                    history = History(g, p)
                    e = self._get_backward_edge(g,
                                                history.edges[rmpath[i]],
                                                history.edges[rmpath[0]],
                                                history)
                    if e is not None:
                        backward_root[e.elb].append(PDFS(g.gid, e, p))
                        newto = dfs_code_min[rmpath[i]].frm
                        flag = True
            if flag:
                backward_min_elb = min(backward_root.keys())
                dfs_code_min.append(DFSedge(
                    maxtoc, newto,
                    (VACANT_VERTEX_LABEL,
                     backward_min_elb,
                     VACANT_VERTEX_LABEL)
                ))
                idx = len(dfs_code_min) - 1
                if self._DFScode[idx] != dfs_code_min[idx]:
                    return False
                return project_is_min(backward_root[backward_min_elb])

            forward_root = collections.defaultdict(Projected)
            flag, newfrm = False, 0
            for p in projected:
                history = History(g, p)
                edges = self._get_forward_pure_edges(g,
                                                     history.edges[rmpath[0]],
                                                     min_vlb,
                                                     history)
                if len(edges) > 0:
                    flag = True
                    newfrm = maxtoc
                    for e in edges:
                        forward_root[
                            (e.elb, g.vertices[e.to].vlb)
                        ].append(PDFS(g.gid, e, p))
            for rmpath_i in rmpath:
                if flag:
                    break
                for p in projected:
                    history = History(g, p)
                    edges = self._get_forward_rmpath_edges(g,
                                                           history.edges[
                                                               rmpath_i],
                                                           min_vlb,
                                                           history)
                    if len(edges) > 0:
                        flag = True
                        newfrm = dfs_code_min[rmpath_i].frm
                        for e in edges:
                            forward_root[
                                (e.elb, g.vertices[e.to].vlb)
                            ].append(PDFS(g.gid, e, p))

            if not flag:
                return True

            forward_min_evlb = min(forward_root.keys())
            dfs_code_min.append(DFSedge(
                newfrm, maxtoc + 1,
                (VACANT_VERTEX_LABEL, forward_min_evlb[0], forward_min_evlb[1]))
            )
            idx = len(dfs_code_min) - 1
            if self._DFScode[idx] != dfs_code_min[idx]:
                return False
            return project_is_min(forward_root[forward_min_evlb])

        res = project_is_min(root[min_vevlb])
        return res

    def _subgraph_mining(self, projected):
        """Store the current pattern and recurse into all its extensions.

        The pattern is skipped when the task prunes it or when its DFS code
        is not minimal.
        """
        gid_subsets = self._get_gid_subsets(projected)
        if self._task.prune(gid_subsets):
            return
        if not self._is_min():
            return
        self._task.store(repr(self._DFScode), gid_subsets)

        num_vertices = self._DFScode.get_num_vertices()
        self._DFScode.build_rmpath()
        rmpath = self._DFScode.rmpath
        maxtoc = self._DFScode[rmpath[0]].to
        min_vlb = self._DFScode[0].vevlb[0]

        forward_root = collections.defaultdict(Projected)
        backward_root = collections.defaultdict(Projected)
        for p in projected:
            g = self._database._graphs[p.gid]
            history = History(g, p)
            # backward
            for rmpath_i in rmpath[::-1]:
                e = self._get_backward_edge(g,
                                            history.edges[rmpath_i],
                                            history.edges[rmpath[0]],
                                            history)
                if e is not None:
                    backward_root[
                        (self._DFScode[rmpath_i].frm, e.elb)
                    ].append(PDFS(g.gid, e, p))
            # pure forward
            if num_vertices >= self._max_num_vertices:
                continue
            edges = self._get_forward_pure_edges(g,
                                                 history.edges[rmpath[0]],
                                                 min_vlb,
                                                 history)
            for e in edges:
                forward_root[
                    (maxtoc, e.elb, g.vertices[e.to].vlb)
                ].append(PDFS(g.gid, e, p))
            # rmpath forward
            for rmpath_i in rmpath:
                edges = self._get_forward_rmpath_edges(g,
                                                       history.edges[rmpath_i],
                                                       min_vlb,
                                                       history)
                for e in edges:
                    forward_root[
                        (self._DFScode[rmpath_i].frm,
                         e.elb, g.vertices[e.to].vlb)
                    ].append(PDFS(g.gid, e, p))

        # backward
        for to, elb in backward_root:
            self._DFScode.append(DFSedge(
                maxtoc, to,
                (VACANT_VERTEX_LABEL, elb, VACANT_VERTEX_LABEL))
            )
            self._subgraph_mining(backward_root[(to, elb)])
            self._DFScode.pop()
        # forward
        # No need to check if num_vertices >= self._max_num_vertices.
        # Because forward_root has no element.
        for frm, elb, vlb2 in forward_root:
            self._DFScode.append(DFSedge(
                frm, maxtoc + 1,
                (VACANT_VERTEX_LABEL, elb, vlb2))
            )
            self._subgraph_mining(forward_root[(frm, elb, vlb2)])
            self._DFScode.pop()

        return self
"""The main program that runs gSpan. Two examples are provided"""
# -*- coding=utf-8 -*-
import os
import sys
import numpy
from sklearn import naive_bayes
from sklearn import metrics

from gspan_mining import gSpan
from gspan_mining import GraphDatabase


class PatternGraphs:
    """
    This template class is used to define a task for the gSpan implementation.
    You should not modify this class but extend it to define new tasks
    """

    def __init__(self, database):
        # A list of subsets of graph identifiers.
        # Is used to specify different groups of graphs (classes and training/test sets).
        # The gid-subsets parameter in the pruning and store function will contain for each subset, all the occurrences
        # in which the examined pattern is present.
        self.gid_subsets = []

        self.database = database  # A graphdatabase instance: contains the data for the problem.

    def store(self, dfs_code, gid_subsets):
        """
        Code to be executed to store the pattern, if desired.
        The function will only be called for patterns that have not been pruned.
        In correlated pattern mining, we may prune based on confidence, but then check further conditions before storing.
        :param dfs_code: the dfs code of the pattern (as a string).
        :param gid_subsets: the cover (set of graph ids in which the pattern is present) for each subset in self.gid_subsets
        """
        print("Please implement the store function in a subclass for a specific mining task!")

    def prune(self, gid_subsets):
        """
        prune function: used by the gSpan algorithm to know if a pattern (and its children in the search tree)
        should be pruned.
        :param gid_subsets: A list of the cover of the pattern for each subset.
        :return: true if the pattern should be pruned, false otherwise.
        """
        print("Please implement the prune function in a subclass for a specific mining task!")


class FrequentPositiveGraphs(PatternGraphs):
    """
    Finds the frequent (support >= minsup) subgraphs among the positive graphs.
    This class provides a method to build a feature matrix for each subset.
    """

    def __init__(self, minsup, database, subsets):
        """
        Initialize the task.
        :param minsup: the minimum positive support
        :param database: the graph database
        :param subsets: the subsets (train and/or test sets for positive and negative class) of graph ids.
        """
        super().__init__(database)
        self.patterns = []  # The patterns found in the end (as dfs codes represented by strings) with their cover (as a list of graph ids).
        self.minsup = minsup
        self.gid_subsets = subsets

    # Stores any pattern found that has not been pruned
    def store(self, dfs_code, gid_subsets):
        self.patterns.append((dfs_code, gid_subsets))

    # Prunes any pattern that is not frequent in the positive class
    def prune(self, gid_subsets):
        # first subset is the set of positive ids
        return len(gid_subsets[0]) < self.minsup

    # creates a column for a feature matrix:
    # 1 where the graph id is covered by the pattern, 0 otherwise
    def create_fm_col(self, all_gids, subset_gids):
        subset_gids = set(subset_gids)
        return [1 if gid in subset_gids else 0 for gid in all_gids]

    # return a feature matrix for each subset of examples, in which the columns correspond to patterns
    # and the rows to examples in the subset.
    def get_feature_matrices(self):
        matrices = [[] for _ in self.gid_subsets]
        for pattern, gid_subsets in self.patterns:
            for i, gid_subset in enumerate(gid_subsets):
                matrices[i].append(self.create_fm_col(self.gid_subsets[i], gid_subset))
        return [numpy.array(matrix).transpose() for matrix in matrices]


def _load_graph_databases(database_file_name_pos, database_file_name_neg):
    """
    Reads the positive and negative class files into one graph database.
    Prints a message and exits with status 1 if one of the files does not exist.
    :return: (graph database, ids of the positive graphs, ids of the negative graphs)
    """
    for file_name in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(file_name):
            print('{} does not exist.'.format(file_name))
            sys.exit(1)  # non-zero status: this is an error exit

    graph_database = GraphDatabase()  # Graph database object
    pos_ids = graph_database.read_graphs(database_file_name_pos)  # Reading positive graphs, adding them to database and getting ids
    neg_ids = graph_database.read_graphs(database_file_name_neg)  # Reading negative graphs, adding them to database and getting ids
    return graph_database, pos_ids, neg_ids


def example1():
    """
    Runs gSpan with the specified positive and negative graphs, finds all frequent subgraphs in the positive class
    with a minimum positive support of minsup and prints them.
    """

    args = sys.argv
    database_file_name_pos = args[1]  # First parameter: path to positive class file
    database_file_name_neg = args[2]  # Second parameter: path to negative class file
    minsup = int(args[3])  # Third parameter: minimum support

    graph_database, pos_ids, neg_ids = _load_graph_databases(database_file_name_pos, database_file_name_neg)

    subsets = [pos_ids, neg_ids]  # The ids for the positive and negative labelled graphs in the database
    task = FrequentPositiveGraphs(minsup, graph_database, subsets)  # Creating task

    gSpan(task).run()  # Running gSpan

    # Printing frequent patterns along with their positive support:
    for pattern, gid_subsets in task.patterns:
        pos_support = len(gid_subsets[0])  # This will have to be replaced by the confidence and support on both classes
        print('{} {}'.format(pattern, pos_support))


def example2():
    """
    Runs gSpan with the specified positive and negative graphs; finds all frequent subgraphs in the training subset of
    the positive class with a minimum support of minsup.
    Uses the patterns found to train a naive bayesian classifier using Scikit-learn and evaluates its performances on
    the test set.
    Performs a k-fold cross-validation.
    """

    args = sys.argv
    database_file_name_pos = args[1]  # First parameter: path to positive class file
    database_file_name_neg = args[2]  # Second parameter: path to negative class file
    minsup = int(args[3])  # Third parameter: minimum support (note: this parameter will be k in case of top-k mining)
    nfolds = int(args[4])  # Fourth parameter: number of folds to use in the k-fold cross-validation.

    graph_database, pos_ids, neg_ids = _load_graph_databases(database_file_name_pos, database_file_name_neg)

    # If less than two folds: using the same set as training and test set (note this is not an accurate way to evaluate the performances!)
    if nfolds < 2:
        subsets = [
            pos_ids,  # Positive training set
            pos_ids,  # Positive test set
            neg_ids,  # Negative training set
            neg_ids  # Negative test set
        ]
        # Printing fold number:
        print('fold {}'.format(1))
        train_and_evaluate(minsup, graph_database, subsets)

    # Otherwise: performs k-fold cross-validation:
    else:
        # Integer division: the last len(ids) % nfolds graphs of each class are never used as test examples.
        pos_fold_size = len(pos_ids) // nfolds
        neg_fold_size = len(neg_ids) // nfolds
        for i in range(nfolds):
            # Use fold as test set, the others as training set for each class;
            # identify all the subsets to be maintained by the graph mining algorithm.
            subsets = [
                numpy.concatenate((pos_ids[:i * pos_fold_size], pos_ids[(i + 1) * pos_fold_size:])),  # Positive training set
                pos_ids[i * pos_fold_size:(i + 1) * pos_fold_size],  # Positive test set
                numpy.concatenate((neg_ids[:i * neg_fold_size], neg_ids[(i + 1) * neg_fold_size:])),  # Negative training set
                neg_ids[i * neg_fold_size:(i + 1) * neg_fold_size],  # Negative test set
            ]
            # Printing fold number:
            print('fold {}'.format(i+1))
            train_and_evaluate(minsup, graph_database, subsets)


def train_and_evaluate(minsup, database, subsets):
    """
    Mines the frequent positive patterns, trains a Gaussian naive Bayes classifier on the training subsets and
    prints the patterns, the predictions and the accuracy on the test subsets.
    :param minsup: the minimum positive support
    :param database: the graph database
    :param subsets: [positive train, positive test, negative train, negative test] graph ids
    """
    task = FrequentPositiveGraphs(minsup, database, subsets)  # Creating task

    gSpan(task).run()  # Running gSpan

    # Creating feature matrices for training and testing:
    features = task.get_feature_matrices()
    train_fm = numpy.concatenate((features[0], features[2]))  # Training feature matrix
    train_labels = numpy.concatenate((numpy.full(len(features[0]), 1, dtype=int), numpy.full(len(features[2]), -1, dtype=int)))  # Training labels
    test_fm = numpy.concatenate((features[1], features[3]))  # Testing feature matrix
    test_labels = numpy.concatenate((numpy.full(len(features[1]), 1, dtype=int), numpy.full(len(features[3]), -1, dtype=int)))  # Testing labels

    classifier = naive_bayes.GaussianNB()  # Creating model object
    classifier.fit(train_fm, train_labels)  # Training model

    predicted = classifier.predict(test_fm)  # Using model to predict labels of testing data

    accuracy = metrics.accuracy_score(test_labels, predicted)  # Computing accuracy:

    # Printing frequent patterns along with their positive support:
    for pattern, gid_subsets in task.patterns:
        pos_support = len(gid_subsets[0])
        print('{} {}'.format(pattern, pos_support))
    # printing classification results:
    print(predicted)
    print('accuracy: {}'.format(accuracy))
    print()  # Blank line to indicate end of fold.


if __name__ == '__main__':
    example1()
    # example2()