pushing template

65893012 · Charles Thomas · 6418df09 · 65893012 · 65893012 · 65893012
--- a/.gitignore
+++ b/.gitignore
+# Idea files and dirs #
+#######################
+*.iml
+**/out/
+**/.idea/
+*.class
+# latex compilation files #
+###########################
+*.aux
+*.log
+*.dvi
+*.synctex.gz
+# OS generated files #
+######################
+.DS_Store
+.DS_Store?
+._*
+# Python cache #
+################
+**/__pycache__/
+*.pyc
--- a/template/gspan_mining/__init__.py
+++ b/template/gspan_mining/__init__.py
+"""This package contains the gSpan implementation along with classes used to represent graphs and graph databases.
+WARNING: The content of this package should be left unchanged."""
+# The docstring must be the first statement of the module to become
+# the package's __doc__; `from __future__` imports may follow it.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from .gspan import gSpan
+from .graphdatabase import GraphDatabase
--- a/template/gspan_mining/graph.py
+++ b/template/gspan_mining/graph.py
+"""Definitions of Edge, Vertex and Graph."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import itertools
+# Sentinel values marking an id or label that has not been set; they are
+# the constructor defaults of the classes defined in this module.
+VACANT_EDGE_ID = -1
+VACANT_VERTEX_ID = -1
+VACANT_EDGE_LABEL = -1
+VACANT_VERTEX_LABEL = -1
+VACANT_GRAPH_ID = -1
+# Edge id value for callers that let the graph number its edges itself
+# (Graph.add_edge replaces the id when eid_auto_increment is set).
+AUTO_EDGE_ID = -1
+class Edge(object):
+    """A labelled edge going from one vertex to another."""
+    def __init__(self,
+                 eid=VACANT_EDGE_ID,
+                 frm=VACANT_VERTEX_ID,
+                 to=VACANT_VERTEX_ID,
+                 elb=VACANT_EDGE_LABEL):
+        """Store the identifier, endpoints and label of the edge.
+        Args:
+            eid: id of the edge.
+            frm: id of the vertex the edge starts from.
+            to: id of the vertex the edge points to.
+            elb: label of the edge.
+        """
+        self.eid, self.frm = eid, frm
+        self.to, self.elb = to, elb
+class Vertex(object):
+    """A labelled vertex together with its outgoing edges."""
+    def __init__(self,
+                 vid=VACANT_VERTEX_ID,
+                 vlb=VACANT_VERTEX_LABEL):
+        """Create a vertex that has no edges yet.
+        Args:
+            vid: id of the vertex.
+            vlb: label of the vertex.
+        """
+        self.vid, self.vlb = vid, vlb
+        # Outgoing edges, keyed by the id of the destination vertex.
+        self.edges = {}
+    def add_edge(self, eid, frm, to, elb):
+        """Record an outgoing edge, replacing any edge already stored for `to`."""
+        outgoing = Edge(eid, frm, to, elb)
+        self.edges[to] = outgoing
+class Graph(object):
+    """Labelled graph stored as a dict of vertices with per-vertex edge maps."""
+    def __init__(self,
+                 gid=VACANT_GRAPH_ID,
+                 is_undirected=True,
+                 eid_auto_increment=True):
+        """Initialize Graph instance.
+        Args:
+            gid: id of this graph.
+            is_undirected: whether this graph is undirected.
+            eid_auto_increment: whether to assign edge ids automatically.
+        """
+        self.gid = gid
+        self.is_undirected = is_undirected
+        # vertex id -> Vertex
+        self.vertices = dict()
+        # edge label -> set of (frm, to) pairs carrying that label
+        self.set_of_elb = collections.defaultdict(set)
+        # vertex label -> set of vertex ids carrying that label
+        self.set_of_vlb = collections.defaultdict(set)
+        self.eid_auto_increment = eid_auto_increment
+        # Source of edge ids when eid_auto_increment is set.
+        self.counter = itertools.count()
+    def get_num_vertices(self):
+        """Return number of vertices in the graph."""
+        return len(self.vertices)
+    def add_vertex(self, vid, vlb):
+        """Add a vertex to the graph; an already known `vid` is left untouched.
+        Returns:
+            This graph, so that calls can be chained.
+        """
+        if vid in self.vertices:
+            return self
+        self.vertices[vid] = Vertex(vid, vlb)
+        self.set_of_vlb[vlb].add(vid)
+        return self
+    def add_edge(self, eid, frm, to, elb):
+        """Add an edge to the graph; an already existing edge is left untouched.
+        For undirected graphs the reverse edge is stored as well, with the
+        same edge id.
+        Args:
+            eid: edge id, ignored when eid_auto_increment is set.
+            frm: id of the source vertex (must already be in the graph).
+            to: id of the destination vertex.
+            elb: edge label.
+        Returns:
+            This graph, so that calls can be chained.
+        """
+        # Membership test (`in`), not identity (`is`): an id is never the
+        # vertices dict itself, so `is` would disable this duplicate guard.
+        if (frm in self.vertices and
+                to in self.vertices and
+                to in self.vertices[frm].edges):
+            return self
+        if self.eid_auto_increment:
+            eid = next(self.counter)
+        self.vertices[frm].add_edge(eid, frm, to, elb)
+        self.set_of_elb[elb].add((frm, to))
+        if self.is_undirected:
+            self.vertices[to].add_edge(eid, to, frm, elb)
+            self.set_of_elb[elb].add((to, frm))
+        return self
+    def display(self):
+        """Print the graph as 't', 'v' and 'e' lines and return them as text.
+        For undirected graphs each edge is listed once, for the direction
+        where `frm < to`.
+        Returns:
+            The 'v ...' and 'e ...' entries, each followed by one space.
+            The 't # gid' header is printed but not part of the string.
+        """
+        entries = []
+        print('t # {}'.format(self.gid))
+        for vid, vertex in self.vertices.items():
+            entry = 'v {} {}'.format(vid, vertex.vlb)
+            print(entry)
+            entries.append(entry)
+        for frm, vertex in self.vertices.items():
+            for to, edge in vertex.edges.items():
+                if self.is_undirected and not frm < to:
+                    continue
+                entry = 'e {} {} {}'.format(frm, to, edge.elb)
+                print(entry)
+                entries.append(entry)
+        # Every entry, directed or not, is terminated by the same separator
+        # so that consecutive entries never run together.
+        return ''.join(entry + ' ' for entry in entries)
+    def plot(self):
+        """Visualize the graph with networkx and matplotlib.
+        If either library cannot be imported, a message is printed and
+        nothing is drawn.
+        """
+        try:
+            import networkx as nx
+            import matplotlib.pyplot as plt
+        except Exception as e:
+            print('Can not plot graph: {}'.format(e))
+            return
+        gnx = nx.Graph() if self.is_undirected else nx.DiGraph()
+        vlbs = {vid: v.vlb for vid, v in self.vertices.items()}
+        elbs = {}
+        for vid, v in self.vertices.items():
+            gnx.add_node(vid, label=v.vlb)
+        for vid, v in self.vertices.items():
+            for to, e in v.edges.items():
+                # Undirected edges are stored in both directions; draw once.
+                if (not self.is_undirected) or vid < to:
+                    gnx.add_edge(vid, to, label=e.elb)
+                    elbs[(vid, to)] = e.elb
+        # One unit of figure size per vertex, capped at 16.
+        side = min(16, 1 * len(self.vertices))
+        plt.figure(3, figsize=(side, side))
+        pos = nx.spectral_layout(gnx)
+        nx.draw_networkx(gnx, pos, arrows=True, with_labels=True, labels=vlbs)
+        nx.draw_networkx_edge_labels(gnx, pos, edge_labels=elbs)
+        plt.show()
--- a/template/gspan_mining/graphdatabase.py
+++ b/template/gspan_mining/graphdatabase.py
+"""Implementation of a gSpan graph database with two classes."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import codecs
+from .graph import AUTO_EDGE_ID
+from .graph import Graph
+class GraphDatabase(object):
+    """Collection of graphs read from text files, indexed by graph id."""
+    def __init__(self):
+        """Create an empty database."""
+        # Id given to the next stored graph (= number of graphs stored).
+        self._graph_cnt = 0
+        # graph id -> Graph
+        self._graphs = dict()
+    def _store_graph(self, graph):
+        """Store `graph` under the next free id and return that id."""
+        gid = self._graph_cnt
+        self._graphs[gid] = graph
+        self._graph_cnt += 1
+        return gid
+    def read_graphs(self, filename):
+        """Read the graphs of `filename` and add them to the database.
+        Recognised lines, with whitespace-separated fields:
+            't ...'                  starts a new graph; a 't' line whose
+                                     last field is '-1' ends the input.
+            'v <id> <label>'         adds a vertex to the current graph.
+            'e <from> <to> <label>'  adds an edge to the current graph.
+        Other lines, including blank ones, are ignored. Ids and labels are
+        kept as strings. Graphs are created undirected.
+        Args:
+            filename: path of a UTF-8 encoded text file.
+        Returns:
+            The list of ids assigned to the graphs read, in file order.
+        """
+        indexes = []
+        tgraph = None
+        with codecs.open(filename, 'r', 'utf-8') as f:
+            for line in f:
+                # split() tolerates tabs and repeated spaces between fields.
+                cols = line.split()
+                if not cols:
+                    continue
+                if cols[0] == 't':
+                    if tgraph is not None:
+                        indexes.append(self._store_graph(tgraph))
+                        tgraph = None
+                    if cols[-1] == '-1':
+                        break
+                    tgraph = Graph(self._graph_cnt,
+                                   is_undirected=True,
+                                   eid_auto_increment=True)
+                elif cols[0] == 'v':
+                    tgraph.add_vertex(cols[1], cols[2])
+                elif cols[0] == 'e':
+                    tgraph.add_edge(AUTO_EDGE_ID, cols[1], cols[2], cols[3])
+        # Adapt to input files that do not end with 't # -1'. The counter is
+        # advanced here too, otherwise the next file read would reuse this
+        # graph's id and overwrite it.
+        if tgraph is not None:
+            indexes.append(self._store_graph(tgraph))
+        return indexes
--- a/template/gspan_mining/gspan.py
+++ b/template/gspan_mining/gspan.py
+"""Implementation of gSpan."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import copy
+import itertools
+import time
+from .graph import AUTO_EDGE_ID
+from .graph import Graph
+from .graph import VACANT_GRAPH_ID
+from .graph import VACANT_VERTEX_LABEL
+def record_timestamp(func):
+    """Record timestamp before and after call of `func`.
+    The decorated method must belong to an object with a `timestamps`
+    dict; the times are stored under '<name>_in' and '<name>_out', where
+    <name> is `func.__name__`.
+    Args:
+        func: the method to wrap.
+    Returns:
+        A wrapper that forwards its arguments to `func` and returns its
+        result.
+    """
+    # Local import so that this decorator does not depend on the module's
+    # import block.
+    import functools
+    @functools.wraps(func)
+    def deco(self, *args, **kwargs):
+        self.timestamps[func.__name__ + '_in'] = time.time()
+        result = func(self, *args, **kwargs)
+        self.timestamps[func.__name__ + '_out'] = time.time()
+        return result
+    return deco
+class DFSedge(object):
+    """One entry of a DFS code: two vertex indices and a label triple."""
+    def __init__(self, frm, to, vevlb):
+        """Store the endpoints and the (vertex, edge, vertex) labels."""
+        self.frm, self.to, self.vevlb = frm, to, vevlb
+    def __eq__(self, other):
+        """Return whether both edges have equal endpoints and labels."""
+        if not self.frm == other.frm:
+            return False
+        if not self.to == other.to:
+            return False
+        return self.vevlb == other.vevlb
+    def __ne__(self, other):
+        """Return the negation of `__eq__`."""
+        same = self.__eq__(other)
+        return not same
+    def __repr__(self):
+        """Return the edge as '(frm=..., to=..., vevlb=...)'."""
+        template = '(frm={}, to={}, vevlb={})'
+        return template.format(self.frm, self.to, self.vevlb)
+class DFScode(list):
+    """A DFS code: a list of DFSedge plus its right-most path."""
+    def __init__(self):
+        """Create an empty DFS code."""
+        super(DFScode, self).__init__()
+        # Indices into this list, filled in by build_rmpath().
+        self.rmpath = []
+    def __eq__(self, other):
+        """Return whether both codes have the same edges in the same order."""
+        if len(self) != len(other):
+            return False
+        return not any(mine != theirs for mine, theirs in zip(self, other))
+    def __ne__(self, other):
+        """Return the negation of `__eq__`."""
+        same = self.__eq__(other)
+        return not same
+    def __repr__(self):
+        """Return the edges, comma separated, inside square brackets."""
+        body = ','.join(str(dfsedge) for dfsedge in self)
+        return '[' + body + ']'
+    def push_back(self, frm, to, vevlb):
+        """Append one edge to the code and return the code."""
+        new_edge = DFSedge(frm, to, vevlb)
+        self.append(new_edge)
+        return self
+    def to_graph(self, gid=VACANT_GRAPH_ID, is_undirected=True):
+        """Build the graph described by this DFS code.
+        Vertices are added only for labels other than VACANT_VERTEX_LABEL;
+        edge ids are assigned by the graph.
+        """
+        graph = Graph(gid,
+                      is_undirected=is_undirected,
+                      eid_auto_increment=True)
+        for dfsedge in self:
+            frm_vlb, elb, to_vlb = dfsedge.vevlb
+            if frm_vlb != VACANT_VERTEX_LABEL:
+                graph.add_vertex(dfsedge.frm, frm_vlb)
+            if to_vlb != VACANT_VERTEX_LABEL:
+                graph.add_vertex(dfsedge.to, to_vlb)
+            graph.add_edge(AUTO_EDGE_ID, dfsedge.frm, dfsedge.to, elb)
+        return graph
+    def from_graph(self, g):
+        """Build DFScode from graph `g` (not available: always raises)."""
+        raise NotImplementedError('Not inplemented yet.')
+    def build_rmpath(self):
+        """Recompute `rmpath` and return the code.
+        The code is scanned from its last edge to its first. An edge with
+        frm < to is kept when it is the first one found, or when its `to`
+        equals the `frm` of the previously kept edge. `rmpath` holds the
+        indices of the kept edges, highest index first.
+        """
+        self.rmpath = []
+        expected_to = None
+        for idx in reversed(range(len(self))):
+            edge = self[idx]
+            if not edge.frm < edge.to:
+                continue
+            if expected_to is None or edge.to == expected_to:
+                self.rmpath.append(idx)
+                expected_to = edge.frm
+        return self
+    def get_num_vertices(self):
+        """Return the number of distinct vertex indices used by the code."""
+        endpoints = {vid for dfsedge in self
+                     for vid in (dfsedge.frm, dfsedge.to)}
+        return len(endpoints)
+class PDFS(object):
+    """One node of a linked list of edges, chained through `prev`."""
+    def __init__(self, gid=VACANT_GRAPH_ID, edge=None, prev=None):
+        """Store the graph id, the edge and the link to the previous node."""
+        self.gid, self.edge, self.prev = gid, edge, prev
+class Projected(list):
+    """A list of PDFS.
+    Each element is the projection of one frequent graph in one original
+    graph.
+    """
+    def __init__(self):
+        """Create an empty projection list."""
+        super().__init__()
+    def push_back(self, gid, edge, prev):
+        """Append a new PDFS built from the arguments and return the list."""
+        node = PDFS(gid, edge, prev)
+        self.append(node)
+        return self
+class History(object):
+    """Edges, vertex ids and edge ids found along one chain of PDFS nodes."""
+    def __init__(self, g, pdfs):
+        """Collect the edges reachable from `pdfs` through its `prev` links.
+        Args:
+            g: the graph the chain belongs to (not used here).
+            pdfs: last node of the chain, or None for an empty history.
+        """
+        super(History, self).__init__()
+        self.edges = list()
+        # Ids seen along the chain map to 1.
+        self.vertices_used = collections.defaultdict(int)
+        self.edges_used = collections.defaultdict(int)
+        # The chain is walked from its last node back to its first one...
+        while pdfs:
+            e = pdfs.edge
+            self.edges.append(e)
+            self.vertices_used[e.frm] = 1
+            self.vertices_used[e.to] = 1
+            self.edges_used[e.eid] = 1
+            pdfs = pdfs.prev
+        # ...so the edges are reversed to get them first to last.
+        self.edges.reverse()
+    def has_vertex(self, vid):
+        """Check if the vertex with vid exists in the history."""
+        # .get() instead of indexing: a query must not insert the id into
+        # the defaultdict.
+        return self.vertices_used.get(vid, 0) == 1
+    def has_edge(self, eid):
+        """Check if the edge with eid exists in the history."""
+        return self.edges_used.get(eid, 0) == 1
+class gSpan(object):
+    """`gSpan` algorithm."""
+    def __init__(self,
+                 task,
+                 min_num_vertices=1,
+                 max_num_vertices=float('inf'),
+                 is_undirected=True,
+                 verbose=False,
+                 visualize=False,
+                 where=False):
+        """Initialize gSpan instance.
+        Args:
+            task: object providing `database`, `gid_subsets`, `prune` and
+                `store`.
+            min_num_vertices: patterns with fewer vertices are not printed
+                by `_report`.
+            max_num_vertices: patterns with this many vertices are not
+                extended with new vertices; raised to `min_num_vertices`
+                when smaller.
+            is_undirected: whether graphs are treated as undirected.
+            verbose: whether `_is_min` prints each DFS code it checks.
+            visualize: whether `_report` plots each pattern.
+            where: whether `_report` prints the ids of the graphs of the
+                projection.
+        """
+        self._is_undirected = is_undirected
+        self._task = task
+        self._database = task.database
+        self._min_num_vertices = min_num_vertices
+        self._max_num_vertices = max_num_vertices
+        self._DFScode = DFScode()
+        self._support = 0
+        self._frequent_size1_subgraphs = list()
+        # Include subgraphs with
+        # any num(but >= 2, <= max_num_vertices) of vertices.
+        # `_report` appends to this list and `print_results` reads it, so
+        # it has to exist before either of them is called.
+        self._frequent_subgraphs = list()
+        # Read by `print_results`, index-aligned with _frequent_subgraphs.
+        self._subgraph_occurrences = list()
+        self._counter = itertools.count()
+        self._verbose = verbose
+        self._visualize = visualize
+        self._where = where
+        self.timestamps = dict()
+        if self._max_num_vertices < self._min_num_vertices:
+            print('Max number of vertices can not be smaller than '
+                  'min number of that.\n'
+                  'Set max_num_vertices = min_num_vertices.')
+            self._max_num_vertices = self._min_num_vertices
+    def time_stats(self):
+        """Print the time spent in `run`, rounded to 0.01 s, and return self."""
+        elapsed = collections.defaultdict(float)
+        for name in ('run',):
+            finished = self.timestamps[name + '_out']
+            started = self.timestamps[name + '_in']
+            elapsed[name] = round(finished - started, 2)
+        print('Total:\t{} s'.format(elapsed['run']))
+        return self
+    def _get_gid_subsets(self, projected):
+        """Return, for each subset of the task, the graph ids of `projected`.
+        Each distinct graph id of the projection is put in the list whose
+        position is given by `self._gid_subset_ids`.
+        """
+        buckets = [[] for _ in self._task.gid_subsets]
+        for gid in set(pdfs.gid for pdfs in projected):
+            subset_idx = self._gid_subset_ids[gid]
+            buckets[subset_idx].append(gid)
+        return buckets
+    @record_timestamp
+    def run(self):
+        """Run the gSpan algorithm.
+        Every graph listed in the task's subsets contributes its root edges,
+        grouped by (vertex label, edge label, vertex label); each group is
+        then mined as a one-edge DFS code.
+        """
+        # Remember, for every graph id, the index of the subset holding it
+        # (the last one wins when an id is listed in several subsets).
+        all_gids = set()
+        self._gid_subset_ids = {}
+        for subset_idx, gid_subset in enumerate(self._task.gid_subsets):
+            for gid in gid_subset:
+                all_gids.add(gid)
+                self._gid_subset_ids[gid] = subset_idx
+        seeds = collections.defaultdict(Projected)
+        for gid in all_gids:
+            graph = self._database._graphs[gid]
+            for vid, vertex in graph.vertices.items():
+                for edge in self._get_forward_root_edges(graph, vid):
+                    key = (vertex.vlb, edge.elb, graph.vertices[edge.to].vlb)
+                    seeds[key].append(PDFS(gid, edge, None))
+        for vevlb, projected in seeds.items():
+            self._DFScode.append(DFSedge(0, 1, vevlb))
+            self._subgraph_mining(projected)
+            self._DFScode.pop()
+    def _report(self, projected):
+        """Record a copy of the current DFS code and print the pattern.
+        Nothing is printed when the pattern has fewer vertices than
+        `min_num_vertices`.
+        """
+        snapshot = copy.copy(self._DFScode)
+        self._frequent_subgraphs.append(snapshot)
+        if self._DFScode.get_num_vertices() < self._min_num_vertices:
+            return
+        pattern = self._DFScode.to_graph(gid=next(self._counter),
+                                         is_undirected=self._is_undirected)
+        pattern.display()
+        print('\nSupport: {}'.format(self._support))
+        if self._visualize:
+            pattern.plot()
+        if self._where:
+            gids = list(set(p.gid for p in projected))
+            print('where: {}'.format(gids))
+        print('\n-----------------\n')
+    def print_results(self):
+        """Display every recorded subgraph followed by its occurrences entry."""
+        for position, dfs_code in enumerate(self._frequent_subgraphs):
+            pattern = dfs_code.to_graph(gid=next(self._counter),
+                                        is_undirected=self._is_undirected)
+            pattern.display()
+            occurrences = self._subgraph_occurrences[position]
+            print(occurrences)
+    def _get_forward_root_edges(self, g, frm):
+        """Return the edges leaving vertex `frm` that may start a DFS code.
+        For undirected graphs only edges whose destination label is not
+        smaller than the label of `frm` are kept; otherwise all are kept.
+        """
+        source = g.vertices[frm]
+        return [
+            edge for to, edge in source.edges.items()
+            if (not self._is_undirected) or source.vlb <= g.vertices[to].vlb
+        ]
+    def _get_backward_edge(self, g, e1, e2, history):
+        """Return an edge going from the target of `e2` to the source of `e1`.
+        Args:
+            g: graph containing the edges.
+            e1: edge whose source vertex the returned edge must reach.
+            e2: edge whose target vertex the returned edge must leave.
+            history: History of the current embedding; edges already in it
+                are skipped.
+        Returns:
+            The first such edge that passes the label ordering test, or
+            None when there is none (for undirected graphs, also when `e1`
+            and `e2` are the same edge).
+        """
+        if self._is_undirected and e1 == e2:
+            return None
+        for to, e in g.vertices[e2.to].edges.items():
+            if history.has_edge(e.eid) or e.to != e1.frm:
+                continue
+            # Returning `e` unconditionally here would require _is_min() to
+            # also check self._DFScode[0] != dfs_code_min[0]; the label
+            # ordering tests below make that unnecessary.
+            if self._is_undirected:
+                if e1.elb < e.elb or (
+                        e1.elb == e.elb and
+                        g.vertices[e1.to].vlb <= g.vertices[e2.to].vlb):
+                    return e
+            else:
+                # Compare vertex labels with vertex labels (`.vlb` on both
+                # sides), never a label with a Vertex object.
+                frm_vlb = g.vertices[e1.frm].vlb
+                to_vlb = g.vertices[e2.to].vlb
+                if frm_vlb < to_vlb or (
+                        frm_vlb == to_vlb and
+                        e1.elb <= e.elb):
+                    return e
+        return None
+    def _get_forward_pure_edges(self, g, rm_edge, min_vlb, history):
+        """Return the edges leaving the target of `rm_edge` towards new vertices.
+        An edge is kept when the label of its destination is at least
+        `min_vlb` and the destination is not in `history`.
+        """
+        outgoing = g.vertices[rm_edge.to].edges
+        return [
+            edge for edge in outgoing.values()
+            if min_vlb <= g.vertices[edge.to].vlb
+            and not history.has_vertex(edge.to)
+        ]
+    def _get_forward_rmpath_edges(self, g, rm_edge, min_vlb, history):
+        """Return the edges leaving the source of `rm_edge` towards new vertices.
+        An edge is skipped when it leads to the target of `rm_edge`, to a
+        vertex whose label is below `min_vlb`, or to a vertex in `history`.
+        Of the others, those whose (edge label, destination label) is not
+        smaller than that of `rm_edge` are kept.
+        """
+        rm_to_vlb = g.vertices[rm_edge.to].vlb
+        kept = []
+        for to, edge in g.vertices[rm_edge.frm].edges.items():
+            cand_vlb = g.vertices[to].vlb
+            if rm_edge.to == edge.to:
+                continue
+            if min_vlb > cand_vlb:
+                continue
+            if history.has_vertex(edge.to):
+                continue
+            if rm_edge.elb < edge.elb:
+                kept.append(edge)
+            elif rm_edge.elb == edge.elb and rm_to_vlb <= cand_vlb:
+                kept.append(edge)
+        return kept
+    def _is_min(self):
+        """Check whether the current DFS code is the minimum one for its graph.
+        The graph described by `self._DFScode` is rebuilt, and a second code,
+        `dfs_code_min`, is grown for it one edge at a time, always taking
+        the smallest candidate key. Each edge added to `dfs_code_min` is
+        compared with the edge at the same index of `self._DFScode`.
+        Returns:
+            True when the code has a single edge or when no compared edge
+            differs; False as soon as one edge differs.
+        """
+        if self._verbose:
+            print('is_min: checking {}'.format(self._DFScode))
+        if len(self._DFScode) == 1:
+            return True
+        g = self._DFScode.to_graph(gid=VACANT_GRAPH_ID,
+                                   is_undirected=self._is_undirected)
+        dfs_code_min = DFScode()
+        root = collections.defaultdict(Projected)
+        # Group the candidate first edges of `g` by their
+        # (vertex label, edge label, vertex label) triple.
+        for vid, v in g.vertices.items():
+            edges = self._get_forward_root_edges(g, vid)
+            for e in edges:
+                root[(v.vlb, e.elb, g.vertices[e.to].vlb)].append(
+                    PDFS(g.gid, e, None))
+        min_vevlb = min(root.keys())
+        dfs_code_min.append(DFSedge(0, 1, min_vevlb))
+        # No need to check if is min code because of pruning in get_*_edge*.
+        def project_is_min(projected):
+            """Extend `dfs_code_min` by one edge and recurse on its projection.
+            Backward edges are tried first, then forward edges from the
+            end of the right-most path, then forward edges from the other
+            vertices of that path. Returns True when no extension exists.
+            """
+            dfs_code_min.build_rmpath()
+            rmpath = dfs_code_min.rmpath
+            min_vlb = dfs_code_min[0].vevlb[0]
+            # rmpath[0] is the index of the last edge of the right-most
+            # path; its `to` is the vertex backward edges start from.
+            maxtoc = dfs_code_min[rmpath[0]].to
+            backward_root = collections.defaultdict(Projected)
+            flag, newto = False, 0,
+            # range() excludes `end`: for undirected graphs rmpath[0] itself
+            # is not tried as a backward target.
+            end = 0 if self._is_undirected else -1
+            for i in range(len(rmpath) - 1, end, -1):
+                # Stop at the first rmpath position that yields an edge.
+                if flag:
+                    break
+                for p in projected:
+                    history = History(g, p)
+                    e = self._get_backward_edge(g,
+                                                history.edges[rmpath[i]],
+                                                history.edges[rmpath[0]],
+                                                history)
+                    if e is not None:
+                        backward_root[e.elb].append(PDFS(g.gid, e, p))
+                        newto = dfs_code_min[rmpath[i]].frm
+                        flag = True
+            if flag:
+                # Take the backward edge with the smallest edge label.
+                backward_min_elb = min(backward_root.keys())
+                dfs_code_min.append(DFSedge(
+                    maxtoc, newto,
+                    (VACANT_VERTEX_LABEL,
+                     backward_min_elb,
+                     VACANT_VERTEX_LABEL)
+                ))
+                idx = len(dfs_code_min) - 1
+                if self._DFScode[idx] != dfs_code_min[idx]:
+                    return False
+                return project_is_min(backward_root[backward_min_elb])
+            forward_root = collections.defaultdict(Projected)
+            flag, newfrm = False, 0
+            # Forward edges leaving the target of the last rmpath edge.
+            for p in projected:
+                history = History(g, p)
+                edges = self._get_forward_pure_edges(g,
+                                                     history.edges[rmpath[0]],
+                                                     min_vlb,
+                                                     history)
+                if len(edges) > 0:
+                    flag = True
+                    newfrm = maxtoc
+                    for e in edges:
+                        forward_root[
+                            (e.elb, g.vertices[e.to].vlb)
+                        ].append(PDFS(g.gid, e, p))
+            # Otherwise, forward edges leaving the sources of the rmpath
+            # edges, in rmpath order; the first position with edges wins.
+            for rmpath_i in rmpath:
+                if flag:
+                    break
+                for p in projected:
+                    history = History(g, p)
+                    edges = self._get_forward_rmpath_edges(g,
+                                                           history.edges[
+                                                               rmpath_i],
+                                                           min_vlb,
+                                                           history)
+                    if len(edges) > 0:
+                        flag = True
+                        newfrm = dfs_code_min[rmpath_i].frm
+                        for e in edges:
+                            forward_root[
+                                (e.elb, g.vertices[e.to].vlb)
+                            ].append(PDFS(g.gid, e, p))
+            # No extension at all: every edge compared so far was equal.
+            if not flag:
+                return True
+            # Take the forward edge with the smallest
+            # (edge label, destination vertex label) pair.
+            forward_min_evlb = min(forward_root.keys())
+            dfs_code_min.append(DFSedge(
+                newfrm, maxtoc + 1,
+                (VACANT_VERTEX_LABEL, forward_min_evlb[0], forward_min_evlb[1]))
+            )
+            idx = len(dfs_code_min) - 1
+            if self._DFScode[idx] != dfs_code_min[idx]:
+                return False
+            return project_is_min(forward_root[forward_min_evlb])
+        res = project_is_min(root[min_vevlb])
+        return res
+    def _subgraph_mining(self, projected):
+        """Process the current DFS code and recursively mine its extensions.
+        The pattern is dropped when the task prunes it or when its DFS code
+        is not minimal. Otherwise it is passed to the task's `store`, and
+        each backward and forward extension found in `projected` is
+        appended to the DFS code, mined, and removed again.
+        Args:
+            projected: Projected list of the occurrences of the current
+                DFS code.
+        Returns:
+            None when the pattern is dropped, self otherwise.
+        """
+        gid_subsets = self._get_gid_subsets(projected)
+        if self._task.prune(gid_subsets):
+            return
+        if not self._is_min():
+            return
+        self._task.store(repr(self._DFScode), gid_subsets)
+        num_vertices = self._DFScode.get_num_vertices()
+        self._DFScode.build_rmpath()
+        rmpath = self._DFScode.rmpath
+        # rmpath[0] is the index of the last edge of the right-most path.
+        maxtoc = self._DFScode[rmpath[0]].to
+        min_vlb = self._DFScode[0].vevlb[0]
+        # Occurrences of each possible extension, keyed by what identifies
+        # the new DFS edge.
+        forward_root = collections.defaultdict(Projected)
+        backward_root = collections.defaultdict(Projected)
+        for p in projected:
+            g = self._database._graphs[p.gid]
+            history = History(g, p)
+            # backward: keyed by (target vertex index, edge label)
+            for rmpath_i in rmpath[::-1]:
+                e = self._get_backward_edge(g,
+                                            history.edges[rmpath_i],
+                                            history.edges[rmpath[0]],
+                                            history)
+                if e is not None:
+                    backward_root[
+                        (self._DFScode[rmpath_i].frm, e.elb)
+                    ].append(PDFS(g.gid, e, p))
+            # pure forward: skipped, together with rmpath forward, once the
+            # pattern has reached the maximum number of vertices
+            if num_vertices >= self._max_num_vertices:
+                continue
+            edges = self._get_forward_pure_edges(g,
+                                                 history.edges[rmpath[0]],
+                                                 min_vlb,
+                                                 history)
+            for e in edges:
+                forward_root[
+                    (maxtoc, e.elb, g.vertices[e.to].vlb)
+                ].append(PDFS(g.gid, e, p))
+            # rmpath forward: keyed by
+            # (source vertex index, edge label, destination vertex label)
+            for rmpath_i in rmpath:
+                edges = self._get_forward_rmpath_edges(g,
+                                                       history.edges[rmpath_i],
+                                                       min_vlb,
+                                                       history)
+                for e in edges:
+                    forward_root[
+                        (self._DFScode[rmpath_i].frm,
+                         e.elb, g.vertices[e.to].vlb)
+                    ].append(PDFS(g.gid, e, p))
+        # backward: the new edge goes from maxtoc to an existing vertex
+        for to, elb in backward_root:
+            self._DFScode.append(DFSedge(
+                maxtoc, to,
+                (VACANT_VERTEX_LABEL, elb, VACANT_VERTEX_LABEL))
+            )
+            self._subgraph_mining(backward_root[(to, elb)])
+            self._DFScode.pop()
+        # forward: the new edge leads to a new vertex numbered maxtoc + 1
+        # No need to check if num_vertices >= self._max_num_vertices.
+        # Because forward_root has no element.
+        for frm, elb, vlb2 in forward_root:
+            self._DFScode.append(DFSedge(
+                frm, maxtoc + 1,
+                (VACANT_VERTEX_LABEL, elb, vlb2))
+            )
+            self._subgraph_mining(forward_root[(frm, elb, vlb2)])
+            self._DFScode.pop()
+        return self
--- a/template/main.py
+++ b/template/main.py
+"""The main program that runs gSpan. Two examples are provided"""
+# -*- coding=utf-8 -*-
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+import numpy
+from sklearn import naive_bayes
+from sklearn import metrics
+from gspan_mining import gSpan
+from gspan_mining import GraphDatabase
+class PatternGraphs:
+	"""
+	Base class describing a mining task for the gSpan implementation.
+	Do not modify this class; extend it and override `store` and `prune` to define a new task.
+	"""
+	def __init__(self, database):
+		# A graphdatabase instance: contains the data for the problem.
+		self.database = database
+		# A list of subsets of graph identifiers, used to specify different groups of graphs
+		# (classes and training/test sets). The gid_subsets parameter of the prune and store
+		# functions contains, for each of these subsets, all the occurrences in which the
+		# examined pattern is present.
+		self.gid_subsets = list()
+	def store(self, dfs_code, gid_subsets):
+		"""
+		Hook executed to store a pattern, if desired.
+		It is only called for patterns that have not been pruned.
+		In correlated pattern mining, we may prune based on confidence, but then check further conditions before storing.
+		This base version only prints a reminder to override it.
+		:param dfs_code: the dfs code of the pattern (as a string).
+		:param gid_subsets: the cover (set of graph ids in which the pattern is present) for each subset in self.gid_subsets
+		"""
+		reminder = "Please implement the store function in a subclass for a specific mining task!"
+		print(reminder)
+	def prune(self, gid_subsets):
+		"""
+		Hook used by the gSpan algorithm to know if a pattern (and its children in the search tree)
+		should be pruned.
+		This base version only prints a reminder to override it and returns nothing.
+		:param gid_subsets: A list of the cover of the pattern for each subset.
+		:return: true if the pattern should be pruned, false otherwise.
+		"""
+		reminder = "Please implement the prune function in a subclass for a specific mining task!"
+		print(reminder)
+class FrequentPositiveGraphs(PatternGraphs):
+	"""
+	Task finding the frequent (support >= minsup) subgraphs among the positive graphs.
+	It can also build a feature matrix for each subset from the patterns found.
+	"""
+	def __init__(self, minsup, database, subsets):
+		"""
+		Set up the task.
+		:param minsup: the minimum positive support
+		:param database: the graph database
+		:param subsets: the subsets (train and/or test sets for positive and negative class) of graph ids.
+		"""
+		super().__init__(database)
+		# The patterns found (dfs codes represented by strings), each paired with its cover
+		# (one list of graph ids per subset).
+		self.patterns = list()
+		self.minsup = minsup
+		self.gid_subsets = subsets
+	def store(self, dfs_code, gid_subsets):
+		"""Keep every pattern that has not been pruned, together with its cover."""
+		entry = (dfs_code, gid_subsets)
+		self.patterns.append(entry)
+	def prune(self, gid_subsets):
+		"""Prune any pattern that is not frequent in the positive class (the first subset)."""
+		positive_cover = gid_subsets[0]
+		return len(positive_cover) < self.minsup
+	def create_fm_col(self, all_gids, subset_gids):
+		"""Return one feature-matrix column: 1 for each id of all_gids found in subset_gids, else 0."""
+		covered = set(subset_gids)
+		return [1 if gid in covered else 0 for gid in all_gids]
+	def get_feature_matrices(self):
+		"""
+		Return a feature matrix for each subset of examples, in which the columns correspond to
+		patterns and the rows to examples in the subset.
+		"""
+		columns_per_subset = [[] for _ in self.gid_subsets]
+		for _pattern, cover in self.patterns:
+			for idx, covered_gids in enumerate(cover):
+				column = self.create_fm_col(self.gid_subsets[idx], covered_gids)
+				columns_per_subset[idx].append(column)
+		return [numpy.array(columns).transpose() for columns in columns_per_subset]
+def example1():
+	"""
+	Mines the frequent subgraphs of the positive class and prints them.
+	Reads the positive file, the negative file and minsup from the command line, runs gSpan with a
+	minimum positive support of minsup and prints each pattern with its positive support.
+	"""
+	# Command line: <positive class file> <negative class file> <minimum support>
+	pos_path, neg_path = sys.argv[1], sys.argv[2]
+	minsup = int(sys.argv[3])
+	for path in (pos_path, neg_path):
+		if not os.path.exists(path):
+			print('{} does not exist.'.format(path))
+			sys.exit()
+	# Both classes are loaded into the same database; each call returns the ids it assigned.
+	graph_database = GraphDatabase()
+	pos_ids = graph_database.read_graphs(pos_path)
+	neg_ids = graph_database.read_graphs(neg_path)
+	# The ids for the positive and negative labelled graphs in the database
+	task = FrequentPositiveGraphs(minsup, graph_database, [pos_ids, neg_ids])
+	gSpan(task).run()
+	# Printing frequent patterns along with their positive support:
+	for pattern, cover in task.patterns:
+		# This will have to be replaced by the confidence and support on both classes
+		pos_support = len(cover[0])
+		print('{} {}'.format(pattern, pos_support))
+def example2():
+	"""
+	Runs gSpan with the specified positive and negative graphs; finds all frequent subgraphs in the training subset of
+	the positive class with a minimum support of minsup.
+	Uses the patterns found to train a naive bayesian classifier using Scikit-learn and evaluates its performances on
+	the test set.
+	Performs a k-fold cross-validation.
+	"""
+	args = sys.argv
+	database_file_name_pos = args[1]  # First parameter: path to positive class file
+	database_file_name_neg = args[2]  # Second parameter: path to negative class file
+	minsup = int(args[3])  # Third parameter: minimum support (note: this parameter will be k in case of top-k mining)
+	nfolds = int(args[4])  # Fourth parameter: number of folds to use in the k-fold cross-validation.
+	if not os.path.exists(database_file_name_pos):
+		print('{} does not exist.'.format(database_file_name_pos))
+		sys.exit()
+	if not os.path.exists(database_file_name_neg):
+		print('{} does not exist.'.format(database_file_name_neg))
+		sys.exit()
+	graph_database = GraphDatabase()  # Graph database object
+	pos_ids = graph_database.read_graphs(database_file_name_pos)  # Reading positive graphs, adding them to database and getting ids
+	neg_ids = graph_database.read_graphs(database_file_name_neg)  # Reading negative graphs, adding them to database and getting ids
+	# If less than two folds: using the same set as training and test set (note this is not an accurate way to evaluate the performances!)
+	if nfolds < 2:
+		subsets = [
+			pos_ids,  # Positive training set
+			pos_ids,  # Positive test set
+			neg_ids,  # Negative training set
+			neg_ids  # Negative test set
+		]
+		# Printing fold number:
+		print('fold {}'.format(1))
+		train_and_evaluate(minsup, graph_database, subsets)
+	# Otherwise: performs k-fold cross-validation:
+	else:
+		# Graphs beyond nfolds * fold_size are never part of a test set.
+		pos_fold_size = len(pos_ids) // nfolds
+		neg_fold_size = len(neg_ids) // nfolds
+		for i in range(nfolds):
+			pos_start, pos_stop = i * pos_fold_size, (i + 1) * pos_fold_size
+			neg_start, neg_stop = i * neg_fold_size, (i + 1) * neg_fold_size
+			# Use fold as test set, the others as training set for each class;
+			# identify all the subsets to be maintained by the graph mining algorithm.
+			# The training sets are built with list concatenation: numpy.concatenate
+			# turns the ids into floats whenever one of the two slices is empty.
+			subsets = [
+				pos_ids[:pos_start] + pos_ids[pos_stop:],  # Positive training set
+				pos_ids[pos_start:pos_stop],  # Positive test set
+				neg_ids[:neg_start] + neg_ids[neg_stop:],  # Negative training set
+				neg_ids[neg_start:neg_stop],  # Negative test set
+			]
+			# Printing fold number:
+			print('fold {}'.format(i+1))
+			train_and_evaluate(minsup, graph_database, subsets)
+def train_and_evaluate(minsup, database, subsets):
+	"""
+	Mines the frequent positive patterns of one fold, trains a Gaussian naive Bayes classifier on
+	them and prints the patterns, the predictions and the accuracy.
+	:param minsup: the minimum positive support
+	:param database: the graph database
+	:param subsets: graph ids of, in order, the positive training, positive test, negative training
+	and negative test sets.
+	"""
+	def labels_for(pos_fm, neg_fm):
+		# One label per row: 1 for the positive rows, then -1 for the negative rows.
+		positives = numpy.full(len(pos_fm), 1, dtype=int)
+		negatives = numpy.full(len(neg_fm), -1, dtype=int)
+		return numpy.concatenate((positives, negatives))
+	task = FrequentPositiveGraphs(minsup, database, subsets)
+	gSpan(task).run()
+	# One feature matrix per subset, in the order of `subsets`.
+	features = task.get_feature_matrices()
+	pos_train, neg_train = features[0], features[2]
+	train_fm = numpy.concatenate((pos_train, neg_train))
+	train_labels = labels_for(pos_train, neg_train)
+	pos_test, neg_test = features[1], features[3]
+	test_fm = numpy.concatenate((pos_test, neg_test))
+	test_labels = labels_for(pos_test, neg_test)
+	# Training the model, then predicting the labels of the testing data:
+	classifier = naive_bayes.GaussianNB()
+	classifier.fit(train_fm, train_labels)
+	predicted = classifier.predict(test_fm)
+	accuracy = metrics.accuracy_score(test_labels, predicted)
+	# Printing frequent patterns along with their positive support:
+	for pattern, cover in task.patterns:
+		print('{} {}'.format(pattern, len(cover[0])))
+	# printing classification results:
+	print(predicted)
+	print('accuracy: {}'.format(accuracy))
+	print()  # Blank line to indicate end of fold.
+# Entry point when the file is run as a script: only example1 is executed;
+# swap the commented line to run the cross-validation example instead.
+if __name__ == '__main__':
+	example1()
+	# example2()