Skip to content
Extraits de code Groupes Projets
Valider 65893012 rédigé par Charles Thomas's avatar Charles Thomas
Parcourir les fichiers

pushing template

parent 6418df09
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
# Idea files and dirs #
#######################
*.iml
**/out/
**/.idea/
*.class
# latex compilation files #
###########################
*.aux
*.log
*.dvi
*.synctex.gz
# OS generated files #
######################
.DS_Store
.DS_Store?
._*
# Python cache #
################
**/__pycache__/
*.pyc
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .gspan import gSpan
from .graphdatabase import GraphDatabase
"""This package contains the gSpan implementation along with classes used to represent graphs and graph databases.
WARNING: The content of this package should be left unchanged."""
"""Definitions of Edge, Vertex and Graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import itertools
# Sentinel values meaning "not set". They are the default ids/labels of
# Edge, Vertex and Graph below.
VACANT_EDGE_ID = -1
VACANT_VERTEX_ID = -1
VACANT_EDGE_LABEL = -1
VACANT_VERTEX_LABEL = -1
VACANT_GRAPH_ID = -1
# Placeholder edge id for Graph.add_edge; when the graph was created with
# eid_auto_increment=True the passed id is replaced by a counter value.
AUTO_EDGE_ID = -1
class Edge(object):
    """A labelled edge going from one vertex id to another."""

    def __init__(self,
                 eid=VACANT_EDGE_ID,
                 frm=VACANT_VERTEX_ID,
                 to=VACANT_VERTEX_ID,
                 elb=VACANT_EDGE_LABEL):
        """Store the fields describing the edge.

        Args:
            eid: id of the edge.
            frm: id of the source vertex.
            to: id of the destination vertex.
            elb: label of the edge.
        """
        self.eid, self.frm = eid, frm
        self.to, self.elb = to, elb
class Vertex(object):
    """A labelled vertex together with its outgoing edges."""

    def __init__(self, vid=VACANT_VERTEX_ID, vlb=VACANT_VERTEX_LABEL):
        """Create a vertex without any edge.

        Args:
            vid: id of the vertex.
            vlb: label of the vertex.
        """
        self.vid, self.vlb = vid, vlb
        # Outgoing edges, keyed by the id of the destination vertex.
        self.edges = {}

    def add_edge(self, eid, frm, to, elb):
        """Register an outgoing edge, replacing any previous edge to `to`."""
        outgoing = Edge(eid, frm, to, elb)
        self.edges[to] = outgoing
class Graph(object):
    """Labelled graph stored as a dict of vertices, each holding its edges."""

    def __init__(self,
                 gid=VACANT_GRAPH_ID,
                 is_undirected=True,
                 eid_auto_increment=True):
        """Initialize Graph instance.

        Args:
            gid: id of this graph.
            is_undirected: whether edges are stored in both directions.
            eid_auto_increment: whether edge ids are taken from an internal
                counter instead of the `eid` passed to `add_edge`.
        """
        self.gid = gid
        self.is_undirected = is_undirected
        # vid -> Vertex
        self.vertices = dict()
        # edge label -> set of (frm, to) pairs carrying that label
        self.set_of_elb = collections.defaultdict(set)
        # vertex label -> set of vertex ids carrying that label
        self.set_of_vlb = collections.defaultdict(set)
        self.eid_auto_increment = eid_auto_increment
        self.counter = itertools.count()

    def get_num_vertices(self):
        """Return number of vertices in the graph."""
        return len(self.vertices)

    def add_vertex(self, vid, vlb):
        """Add a vertex to the graph; an already known `vid` is left as is.

        Returns:
            This graph, so that calls can be chained.
        """
        if vid in self.vertices:
            return self
        self.vertices[vid] = Vertex(vid, vlb)
        self.set_of_vlb[vlb].add(vid)
        return self

    def add_edge(self, eid, frm, to, elb):
        """Add an edge to the graph; an existing frm -> to edge is kept.

        Both end points must already have been added with `add_vertex`.

        Returns:
            This graph, so that calls can be chained.

        Raises:
            KeyError: if `frm` (or `to`, for undirected graphs) is unknown.
        """
        # Membership test: the previous identity test (`frm is
        # self.vertices`) was always false, so duplicates were never skipped.
        if (frm in self.vertices and
                to in self.vertices and
                to in self.vertices[frm].edges):
            return self
        if self.eid_auto_increment:
            eid = next(self.counter)
        self.vertices[frm].add_edge(eid, frm, to, elb)
        self.set_of_elb[elb].add((frm, to))
        if self.is_undirected:
            # The mirrored edge shares the same id.
            self.vertices[to].add_edge(eid, to, frm, elb)
            self.set_of_elb[elb].add((to, frm))
        return self

    def display(self):
        """Print the graph as 't'/'v'/'e' lines and return them as one string.

        For undirected graphs each edge is printed once, from the smaller
        vertex id to the larger one.
        """
        display_str = ''
        print('t # {}'.format(self.gid))
        for vid in self.vertices:
            print('v {} {}'.format(vid, self.vertices[vid].vlb))
            display_str += 'v {} {} '.format(vid, self.vertices[vid].vlb)
        for frm in self.vertices:
            edges = self.vertices[frm].edges
            for to in edges:
                if self.is_undirected:
                    if frm < to:
                        print('e {} {} {}'.format(frm, to, edges[to].elb))
                        display_str += 'e {} {} {} '.format(
                            frm, to, edges[to].elb)
                else:
                    print('e {} {} {}'.format(frm, to, edges[to].elb))
                    display_str += 'e {} {} {}'.format(frm, to, edges[to].elb)
        return display_str

    def plot(self):
        """Visualize the graph with networkx and matplotlib.

        If either package cannot be imported a message is printed and
        nothing is drawn.
        """
        try:
            import networkx as nx
            import matplotlib.pyplot as plt
        except Exception as e:
            print('Can not plot graph: {}'.format(e))
            return
        gnx = nx.Graph() if self.is_undirected else nx.DiGraph()
        vlbs = {vid: v.vlb for vid, v in self.vertices.items()}
        elbs = {}
        for vid, v in self.vertices.items():
            gnx.add_node(vid, label=v.vlb)
        for vid, v in self.vertices.items():
            for to, e in v.edges.items():
                # Undirected edges are stored twice; draw them only once.
                if (not self.is_undirected) or vid < to:
                    gnx.add_edge(vid, to, label=e.elb)
                    elbs[(vid, to)] = e.elb
        # One inch per vertex in each dimension, capped at 16.
        fsize = (min(16, 1 * len(self.vertices)),
                 min(16, 1 * len(self.vertices)))
        plt.figure(3, figsize=fsize)
        pos = nx.spectral_layout(gnx)
        nx.draw_networkx(gnx, pos, arrows=True, with_labels=True, labels=vlbs)
        nx.draw_networkx_edge_labels(gnx, pos, edge_labels=elbs)
        plt.show()
"""Implementation of a gSpan graph database with two classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
from .graph import AUTO_EDGE_ID
from .graph import Graph
class GraphDatabase(object):
    """In-memory collection of graphs read from one or several text files."""

    def __init__(self):
        """Create an empty database."""
        # Id that will be given to the next graph that is stored.
        self._graph_cnt = 0
        # graph id -> Graph
        self._graphs = dict()

    def _register(self, graph, indexes):
        """Store `graph` under the next free id and record that id."""
        self._graphs[self._graph_cnt] = graph
        indexes.append(self._graph_cnt)
        self._graph_cnt += 1

    def read_graphs(self, filename):
        """Read the graphs of `filename` and add them to the database.

        Lines are split on single spaces and dispatched on their first field:
        't' starts a new graph (and stops the reading when its last field is
        '-1'), 'v <id> <label>' adds a vertex and 'e <frm> <to> <label>' adds
        an edge to the current graph. Ids and labels are kept as strings.
        Graphs are created undirected with automatically assigned edge ids.

        Args:
            filename: path of a UTF-8 encoded text file.

        Returns:
            The list of database ids given to the graphs that were read.
        """
        indexes = []
        with codecs.open(filename, 'r', 'utf-8') as f:
            lines = [line.strip() for line in f.readlines()]
        tgraph = None
        for line in lines:
            cols = line.split(' ')
            if cols[0] == 't':
                if tgraph is not None:
                    self._register(tgraph, indexes)
                    tgraph = None
                if cols[-1] == '-1':
                    break
                tgraph = Graph(self._graph_cnt,
                               is_undirected=True,
                               eid_auto_increment=True)
            elif cols[0] == 'v':
                tgraph.add_vertex(cols[1], cols[2])
            elif cols[0] == 'e':
                tgraph.add_edge(AUTO_EDGE_ID, cols[1], cols[2], cols[3])
        # Adapt to input files that do not end with 't # -1'. The counter is
        # advanced here as well, otherwise the next call to read_graphs would
        # reuse this id and overwrite the graph.
        if tgraph is not None:
            self._register(tgraph, indexes)
        return indexes
"""Implementation of gSpan."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import copy
import itertools
import time
from .graph import AUTO_EDGE_ID
from .graph import Graph
from .graph import VACANT_GRAPH_ID
from .graph import VACANT_VERTEX_LABEL
def record_timestamp(func):
    """Record timestamp before and after call of `func`.

    `func` must be a method taking only `self`, and `self.timestamps` must be
    a dict: the times are stored under '<name>_in' and '<name>_out'. The
    '_out' entry is only written when `func` returns normally.

    Returns:
        A wrapper that keeps the name and docstring of `func` and forwards
        its return value.
    """
    import functools

    @functools.wraps(func)
    def deco(self):
        self.timestamps[func.__name__ + '_in'] = time.time()
        result = func(self)
        self.timestamps[func.__name__ + '_out'] = time.time()
        return result
    return deco
class DFSedge(object):
    """One entry of a DFS code: two end points and a label triple."""

    def __init__(self, frm, to, vevlb):
        """Keep the end points and the (vertex, edge, vertex) label triple."""
        self.frm, self.to, self.vevlb = frm, to, vevlb

    def __eq__(self, other):
        """Two entries are equal when all three of their fields are equal."""
        return all(getattr(self, field) == getattr(other, field)
                   for field in ('frm', 'to', 'vevlb'))

    def __ne__(self, other):
        """Negation of the equality test."""
        same = self.__eq__(other)
        return not same

    def __repr__(self):
        """Return the entry formatted as '(frm=.., to=.., vevlb=..)'."""
        fields = (self.frm, self.to, self.vevlb)
        return '(frm={}, to={}, vevlb={})'.format(*fields)
class DFScode(list):
    """A DFS code, i.e. a list of DFSedge entries."""

    def __init__(self):
        """Start with an empty code and an empty right-most path."""
        super(DFScode, self).__init__()
        self.rmpath = []

    def __eq__(self, other):
        """Codes are equal when they have equal entries at every position."""
        if len(self) != len(other):
            return False
        return not any(mine != theirs for mine, theirs in zip(self, other))

    def __ne__(self, other):
        """Negation of the equality test."""
        same = self.__eq__(other)
        return not same

    def __repr__(self):
        """Return the entries, comma separated, between square brackets."""
        body = ','.join(str(entry) for entry in self)
        return '[' + body + ']'

    def push_back(self, frm, to, vevlb):
        """Append a new entry built from the arguments and return the code."""
        entry = DFSedge(frm, to, vevlb)
        self.append(entry)
        return self

    def to_graph(self, gid=VACANT_GRAPH_ID, is_undirected=True):
        """Build the graph described by this code.

        A vertex is only added when its label in the entry is not the vacant
        vertex label.
        """
        graph = Graph(gid,
                      is_undirected=is_undirected,
                      eid_auto_increment=True)
        for entry in self:
            src_label, edge_label, dst_label = entry.vevlb
            if src_label != VACANT_VERTEX_LABEL:
                graph.add_vertex(entry.frm, src_label)
            if dst_label != VACANT_VERTEX_LABEL:
                graph.add_vertex(entry.to, dst_label)
            graph.add_edge(AUTO_EDGE_ID, entry.frm, entry.to, edge_label)
        return graph

    def from_graph(self, g):
        """Build DFScode from graph `g` (not available)."""
        raise NotImplementedError('Not inplemented yet.')

    def build_rmpath(self):
        """Recompute `rmpath`, the indexes of the right-most path entries.

        The code is scanned from its last entry to its first one; a forward
        entry (frm < to) is kept when it is the first one found or when it
        ends where the previously kept entry starts.
        """
        self.rmpath = []
        expected_to = None
        for idx in reversed(range(len(self))):
            entry = self[idx]
            if entry.frm >= entry.to:
                continue
            if expected_to is None or entry.to == expected_to:
                self.rmpath.append(idx)
                expected_to = entry.frm
        return self

    def get_num_vertices(self):
        """Return the number of distinct vertex ids used by the entries."""
        vertex_ids = {vid for entry in self for vid in (entry.frm, entry.to)}
        return len(vertex_ids)
class PDFS(object):
    """Link of a chain of edges: a graph id, an edge and the previous link."""

    def __init__(self, gid=VACANT_GRAPH_ID, edge=None, prev=None):
        """Keep the graph id, the edge and the previous link (or None)."""
        self.gid, self.edge, self.prev = gid, edge, prev
class Projected(list):
    """A list of PDFS.

    Each element is one projection of a frequent graph in one original graph.
    """

    def __init__(self):
        """Start with an empty list."""
        super().__init__()

    def push_back(self, gid, edge, prev):
        """Append a new PDFS built from the arguments and return the list."""
        link = PDFS(gid, edge, prev)
        self.append(link)
        return self
class History(object):
    """Edges and vertices reached by following a PDFS chain to its start."""

    def __init__(self, g, pdfs):
        """Walk the `prev` links of `pdfs` and record what is used.

        Args:
            g: not used by this class; kept for the callers' signature.
            pdfs: last link of the chain (an object with `edge` and `prev`
                attributes), or None for an empty history.
        """
        super(History, self).__init__()
        self.edges = list()
        # vertex id -> 1 and edge id -> 1 for everything seen on the chain.
        self.vertices_used = collections.defaultdict(int)
        self.edges_used = collections.defaultdict(int)
        while pdfs:
            e = pdfs.edge
            self.edges.append(e)
            (self.vertices_used[e.frm],
             self.vertices_used[e.to],
             self.edges_used[e.eid]) = 1, 1, 1
            pdfs = pdfs.prev
        # The chain is walked backwards; put the first edge first.
        self.edges = self.edges[::-1]

    def has_vertex(self, vid):
        """Check if the vertex with vid exists in the history."""
        # .get() instead of indexing: a query must not insert the key into
        # the defaultdict.
        return self.vertices_used.get(vid) == 1

    def has_edge(self, eid):
        """Check if the edge with eid exists in the history."""
        return self.edges_used.get(eid) == 1
class gSpan(object):
    """`gSpan` algorithm.

    The search is driven by a task object that must provide `database`,
    `gid_subsets`, `prune(gid_subsets)` and `store(dfs_code, gid_subsets)`.
    """

    def __init__(self,
                 task,
                 min_num_vertices=1,
                 max_num_vertices=float('inf'),
                 is_undirected=True,
                 verbose=False,
                 visualize=False,
                 where=False):
        """Initialize gSpan instance.

        Args:
            task: object deciding what is pruned and what is stored.
            min_num_vertices: smallest pattern size displayed by `_report`.
            max_num_vertices: patterns with this many vertices are not
                extended with new vertices.
            is_undirected: whether the graphs are undirected.
            verbose: print the DFS code checked by each minimality test.
            visualize: plot the patterns in `_report`.
            where: print the ids of the supporting graphs in `_report`.
        """
        self._is_undirected = is_undirected
        self._task = task
        self._database = task.database
        self._min_num_vertices = min_num_vertices
        self._max_num_vertices = max_num_vertices
        self._DFScode = DFScode()
        self._support = 0
        self._frequent_size1_subgraphs = list()
        # Filled by _report and read by print_results; both lists are kept
        # parallel (one entry per reported pattern).
        self._frequent_subgraphs = list()
        self._subgraph_occurrences = list()
        # graph id -> indexes of the task subsets containing it (set by run).
        self._gid_subset_ids = {}
        self._counter = itertools.count()
        self._verbose = verbose
        self._visualize = visualize
        self._where = where
        self.timestamps = dict()
        if self._max_num_vertices < self._min_num_vertices:
            print('Max number of vertices can not be smaller than '
                  'min number of that.\n'
                  'Set max_num_vertices = min_num_vertices.')
            self._max_num_vertices = self._min_num_vertices

    def time_stats(self):
        """Print the duration of `run`, which must have been called before."""
        func_names = ['run']
        time_deltas = collections.defaultdict(float)
        for fn in func_names:
            time_deltas[fn] = round(
                self.timestamps[fn + '_out'] - self.timestamps[fn + '_in'],
                2
            )
        print('Total:\t{} s'.format(time_deltas['run']))
        return self

    def _get_gid_subsets(self, projected):
        """Split the graph ids of `projected` according to the task subsets.

        Returns:
            One list per subset of the task, holding the ids of the graphs of
            that subset that appear in `projected`.
        """
        subsets = [[] for _ in self._task.gid_subsets]
        gids = set([g.gid for g in projected])
        for gid in gids:
            # A graph id may belong to several subsets (e.g. when the same
            # graphs are used as training and test set).
            for subset_id in self._gid_subset_ids[gid]:
                subsets[subset_id].append(gid)
        return subsets

    @record_timestamp
    def run(self):
        """Run the gSpan algorithm on the graphs listed in the task subsets."""
        root = collections.defaultdict(Projected)
        gids = set([gid for gid_subset in self._task.gid_subsets for gid in gid_subset])
        self._gid_subset_ids = {}
        for i, gid_subset in enumerate(self._task.gid_subsets):
            for gid in gid_subset:
                subset_ids = self._gid_subset_ids.setdefault(gid, [])
                if i not in subset_ids:
                    subset_ids.append(i)
        # Group every single-edge pattern by its label triple.
        for gid in gids:
            g = self._database._graphs[gid]
            for vid, v in g.vertices.items():
                edges = self._get_forward_root_edges(g, vid)
                for e in edges:
                    root[(v.vlb, e.elb, g.vertices[e.to].vlb)].append(
                        PDFS(gid, e, None)
                    )
        for vevlb, projected in root.items():
            self._DFScode.append(DFSedge(0, 1, vevlb))
            self._subgraph_mining(projected)
            self._DFScode.pop()

    def _report(self, projected):
        """Record the current DFS code and display the matching graph.

        This method is not called by the mining loop of this class.
        """
        self._frequent_subgraphs.append(copy.copy(self._DFScode))
        self._subgraph_occurrences.append(
            list(set([p.gid for p in projected])))
        if self._DFScode.get_num_vertices() < self._min_num_vertices:
            return
        g = self._DFScode.to_graph(gid=next(self._counter),
                                   is_undirected=self._is_undirected)
        g.display()
        print('\nSupport: {}'.format(self._support))
        if self._visualize:
            g.plot()
        if self._where:
            print('where: {}'.format(list(set([p.gid for p in projected]))))
        print('\n-----------------\n')

    def print_results(self):
        """Display every pattern recorded by `_report` with its graph ids."""
        for i, subgraph in enumerate(self._frequent_subgraphs):
            g = subgraph.to_graph(gid=next(self._counter),
                                  is_undirected=self._is_undirected)
            g.display()
            print(self._subgraph_occurrences[i])

    def _get_forward_root_edges(self, g, frm):
        """Return the edges leaving `frm` that can start a DFS code.

        For undirected graphs only the edges whose destination label is not
        smaller than the label of `frm` are kept.
        """
        result = []
        v_frm = g.vertices[frm]
        for to, e in v_frm.edges.items():
            if (not self._is_undirected) or v_frm.vlb <= g.vertices[to].vlb:
                result.append(e)
        return result

    def _get_backward_edge(self, g, e1, e2, history):
        """Return an unused edge from the end of `e2` back to the start of `e1`.

        Returns None when there is no such edge satisfying the label ordering
        conditions.
        """
        if self._is_undirected and e1 == e2:
            return None
        for to, e in g.vertices[e2.to].edges.items():
            if history.has_edge(e.eid) or e.to != e1.frm:
                continue
            # if return here, then self._DFScodep[0] != dfs_code_min[0]
            # should be checked in _is_min(). or:
            if self._is_undirected:
                if e1.elb < e.elb or (
                        e1.elb == e.elb and
                        g.vertices[e1.to].vlb <= g.vertices[e2.to].vlb):
                    return e
            else:
                # Compare labels with labels: the vertex object itself was
                # previously used on the right-hand side.
                if g.vertices[e1.frm].vlb < g.vertices[e2.to].vlb or (
                        g.vertices[e1.frm].vlb == g.vertices[e2.to].vlb and
                        e1.elb <= e.elb):
                    return e
        return None

    def _get_forward_pure_edges(self, g, rm_edge, min_vlb, history):
        """Return the edges leaving the end of `rm_edge` towards new vertices.

        A destination is accepted when it is not in `history` and its label
        is not smaller than `min_vlb`.
        """
        result = []
        for to, e in g.vertices[rm_edge.to].edges.items():
            if min_vlb <= g.vertices[e.to].vlb and (
                    not history.has_vertex(e.to)):
                result.append(e)
        return result

    def _get_forward_rmpath_edges(self, g, rm_edge, min_vlb, history):
        """Return the edges leaving the start of `rm_edge` towards new vertices.

        Besides the conditions of `_get_forward_pure_edges`, the
        (edge label, destination label) pair must not be smaller than the one
        of `rm_edge`.
        """
        result = []
        to_vlb = g.vertices[rm_edge.to].vlb
        for to, e in g.vertices[rm_edge.frm].edges.items():
            new_to_vlb = g.vertices[to].vlb
            if (rm_edge.to == e.to or
                    min_vlb > new_to_vlb or
                    history.has_vertex(e.to)):
                continue
            if rm_edge.elb < e.elb or (rm_edge.elb == e.elb and
                                       to_vlb <= new_to_vlb):
                result.append(e)
        return result

    def _is_min(self):
        """Check whether the current DFS code is the minimum one of its graph.

        The minimum code of the graph built from the current code is grown
        entry by entry and compared with the current code at each step.
        """
        if self._verbose:
            print('is_min: checking {}'.format(self._DFScode))
        if len(self._DFScode) == 1:
            return True
        g = self._DFScode.to_graph(gid=VACANT_GRAPH_ID,
                                   is_undirected=self._is_undirected)
        dfs_code_min = DFScode()
        root = collections.defaultdict(Projected)
        for vid, v in g.vertices.items():
            edges = self._get_forward_root_edges(g, vid)
            for e in edges:
                root[(v.vlb, e.elb, g.vertices[e.to].vlb)].append(
                    PDFS(g.gid, e, None))
        min_vevlb = min(root.keys())
        dfs_code_min.append(DFSedge(0, 1, min_vevlb))
        # No need to check if is min code because of pruning in get_*_edge*.

        def project_is_min(projected):
            """Extend `dfs_code_min` by one entry and compare; recurse."""
            dfs_code_min.build_rmpath()
            rmpath = dfs_code_min.rmpath
            min_vlb = dfs_code_min[0].vevlb[0]
            maxtoc = dfs_code_min[rmpath[0]].to

            # Backward extensions are tried first.
            backward_root = collections.defaultdict(Projected)
            flag, newto = False, 0
            end = 0 if self._is_undirected else -1
            for i in range(len(rmpath) - 1, end, -1):
                if flag:
                    break
                for p in projected:
                    history = History(g, p)
                    e = self._get_backward_edge(g,
                                                history.edges[rmpath[i]],
                                                history.edges[rmpath[0]],
                                                history)
                    if e is not None:
                        backward_root[e.elb].append(PDFS(g.gid, e, p))
                        newto = dfs_code_min[rmpath[i]].frm
                        flag = True
            if flag:
                backward_min_elb = min(backward_root.keys())
                dfs_code_min.append(DFSedge(
                    maxtoc, newto,
                    (VACANT_VERTEX_LABEL,
                     backward_min_elb,
                     VACANT_VERTEX_LABEL)
                ))
                idx = len(dfs_code_min) - 1
                if self._DFScode[idx] != dfs_code_min[idx]:
                    return False
                return project_is_min(backward_root[backward_min_elb])

            # Then forward extensions: from the right-most vertex first,
            # then from the other vertices of the right-most path.
            forward_root = collections.defaultdict(Projected)
            flag, newfrm = False, 0
            for p in projected:
                history = History(g, p)
                edges = self._get_forward_pure_edges(g,
                                                     history.edges[rmpath[0]],
                                                     min_vlb,
                                                     history)
                if len(edges) > 0:
                    flag = True
                    newfrm = maxtoc
                    for e in edges:
                        forward_root[
                            (e.elb, g.vertices[e.to].vlb)
                        ].append(PDFS(g.gid, e, p))
            for rmpath_i in rmpath:
                if flag:
                    break
                for p in projected:
                    history = History(g, p)
                    edges = self._get_forward_rmpath_edges(g,
                                                           history.edges[
                                                               rmpath_i],
                                                           min_vlb,
                                                           history)
                    if len(edges) > 0:
                        flag = True
                        newfrm = dfs_code_min[rmpath_i].frm
                        for e in edges:
                            forward_root[
                                (e.elb, g.vertices[e.to].vlb)
                            ].append(PDFS(g.gid, e, p))

            # No extension left: every entry matched.
            if not flag:
                return True

            forward_min_evlb = min(forward_root.keys())
            dfs_code_min.append(DFSedge(
                newfrm, maxtoc + 1,
                (VACANT_VERTEX_LABEL, forward_min_evlb[0], forward_min_evlb[1]))
            )
            idx = len(dfs_code_min) - 1
            if self._DFScode[idx] != dfs_code_min[idx]:
                return False
            return project_is_min(forward_root[forward_min_evlb])

        res = project_is_min(root[min_vevlb])
        return res

    def _subgraph_mining(self, projected):
        """Handle the current DFS code, then recurse on its extensions.

        The code is dropped when the task prunes it or when it is not
        minimal; otherwise it is handed to the task's `store` and extended
        with backward and forward edges.
        """
        gid_subsets = self._get_gid_subsets(projected)
        if self._task.prune(gid_subsets):
            return
        if not self._is_min():
            return
        self._task.store(repr(self._DFScode), gid_subsets)

        num_vertices = self._DFScode.get_num_vertices()
        self._DFScode.build_rmpath()
        rmpath = self._DFScode.rmpath
        maxtoc = self._DFScode[rmpath[0]].to
        min_vlb = self._DFScode[0].vevlb[0]

        forward_root = collections.defaultdict(Projected)
        backward_root = collections.defaultdict(Projected)
        for p in projected:
            g = self._database._graphs[p.gid]
            history = History(g, p)
            # backward
            for rmpath_i in rmpath[::-1]:
                e = self._get_backward_edge(g,
                                            history.edges[rmpath_i],
                                            history.edges[rmpath[0]],
                                            history)
                if e is not None:
                    backward_root[
                        (self._DFScode[rmpath_i].frm, e.elb)
                    ].append(PDFS(g.gid, e, p))
            # pure forward
            if num_vertices >= self._max_num_vertices:
                continue
            edges = self._get_forward_pure_edges(g,
                                                 history.edges[rmpath[0]],
                                                 min_vlb,
                                                 history)
            for e in edges:
                forward_root[
                    (maxtoc, e.elb, g.vertices[e.to].vlb)
                ].append(PDFS(g.gid, e, p))
            # rmpath forward
            for rmpath_i in rmpath:
                edges = self._get_forward_rmpath_edges(g,
                                                       history.edges[rmpath_i],
                                                       min_vlb,
                                                       history)
                for e in edges:
                    forward_root[
                        (self._DFScode[rmpath_i].frm,
                         e.elb, g.vertices[e.to].vlb)
                    ].append(PDFS(g.gid, e, p))

        # backward
        for to, elb in backward_root:
            self._DFScode.append(DFSedge(
                maxtoc, to,
                (VACANT_VERTEX_LABEL, elb, VACANT_VERTEX_LABEL))
            )
            self._subgraph_mining(backward_root[(to, elb)])
            self._DFScode.pop()
        # forward
        # No need to check if num_vertices >= self._max_num_vertices.
        # Because forward_root has no element.
        for frm, elb, vlb2 in forward_root:
            self._DFScode.append(DFSedge(
                frm, maxtoc + 1,
                (VACANT_VERTEX_LABEL, elb, vlb2))
            )
            self._subgraph_mining(forward_root[(frm, elb, vlb2)])
            self._DFScode.pop()
        return self
"""The main program that runs gSpan. Two examples are provided"""
# -*- coding=utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import numpy
from sklearn import naive_bayes
from sklearn import metrics
from gspan_mining import gSpan
from gspan_mining import GraphDatabase
class PatternGraphs:
    """
    Template describing a task for the gSpan implementation.
    Do not modify this class: extend it to define new tasks.
    """

    def __init__(self, database):
        # List of subsets of graph identifiers, used to specify different
        # groups of graphs (classes and training/test sets). The gid_subsets
        # argument of prune and store holds, for each of these subsets, the
        # occurrences in which the examined pattern is present.
        self.gid_subsets = list()
        # A graph database instance: contains the data for the problem.
        self.database = database

    def store(self, dfs_code, gid_subsets):
        """
        Code to be executed to store the pattern, if desired.
        Only called for patterns that have not been pruned. In correlated
        pattern mining, one may prune based on confidence and then check
        further conditions before storing.
        :param dfs_code: the dfs code of the pattern (as a string).
        :param gid_subsets: the cover (graph ids in which the pattern is present) for each subset in self.gid_subsets
        """
        reminder = "Please implement the store function in a subclass for a specific mining task!"
        print(reminder)

    def prune(self, gid_subsets):
        """
        Used by the gSpan algorithm to know if a pattern (and its children in
        the search tree) should be pruned. This template only prints a
        reminder and returns nothing.
        :param gid_subsets: A list of the cover of the pattern for each subset.
        :return: subclasses return true if the pattern should be pruned, false otherwise.
        """
        reminder = "Please implement the prune function in a subclass for a specific mining task!"
        print(reminder)
class FrequentPositiveGraphs(PatternGraphs):
    """
    Task finding the subgraphs that are frequent (support >= minsup) among the
    positive graphs. Also builds one feature matrix per subset.
    """

    def __init__(self, minsup, database, subsets):
        """
        Set up the task.
        :param minsup: the minimum positive support
        :param database: the graph database
        :param subsets: the subsets (train and/or test sets for positive and negative class) of graph ids.
        """
        super().__init__(database)
        # Patterns found so far: (dfs code as a string, cover per subset).
        self.patterns = list()
        self.minsup = minsup
        self.gid_subsets = subsets

    def store(self, dfs_code, gid_subsets):
        """Keep every pattern that has not been pruned."""
        found = (dfs_code, gid_subsets)
        self.patterns.append(found)

    def prune(self, gid_subsets):
        """Prune the patterns that are not frequent in the first (positive) subset."""
        positive_cover = gid_subsets[0]
        return len(positive_cover) < self.minsup

    def create_fm_col(self, all_gids, subset_gids):
        """Return one 0/1 value per id of all_gids: 1 when the id is in subset_gids."""
        covered = set(subset_gids)
        return [1 if gid in covered else 0 for gid in all_gids]

    def get_feature_matrices(self):
        """
        Return one feature matrix per subset of examples: columns correspond
        to patterns and rows to the examples of the subset.
        """
        columns_per_subset = [[] for _ in self.gid_subsets]
        for _, covers in self.patterns:
            for idx, cover in enumerate(covers):
                column = self.create_fm_col(self.gid_subsets[idx], cover)
                columns_per_subset[idx].append(column)
        # Columns were collected as rows, hence the transposition.
        return [numpy.array(columns).transpose() for columns in columns_per_subset]
def example1():
    """
    Mines the subgraphs that are frequent in the positive class (minimum
    positive support: minsup) and prints each of them with its positive support.
    Command line: <positive class file> <negative class file> <minsup>.
    """
    argv = sys.argv
    database_file_name_pos, database_file_name_neg = argv[1], argv[2]
    minsup = int(argv[3])

    # Stop as soon as one of the two input files is missing.
    for file_name in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(file_name):
            print('{} does not exist.'.format(file_name))
            sys.exit()

    # Both classes are loaded into the same database; the returned ids tell
    # which graphs belong to which class.
    graph_database = GraphDatabase()
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    task = FrequentPositiveGraphs(minsup, graph_database, [pos_ids, neg_ids])
    gSpan(task).run()

    # Printing frequent patterns along with their positive support
    # (to be replaced by the confidence and support on both classes).
    for pattern, covers in task.patterns:
        print('{} {}'.format(pattern, len(covers[0])))
def example2():
    """
    Runs gSpan with the specified positive and negative graphs; finds all frequent subgraphs in the training subset of
    the positive class with a minimum support of minsup.
    Uses the patterns found to train a naive bayesian classifier using Scikit-learn and evaluates its performances on
    the test set.
    Performs a k-fold cross-validation.
    Command line: <positive class file> <negative class file> <minsup> <nfolds>.
    """
    args = sys.argv
    database_file_name_pos = args[1]  # First parameter: path to positive class file
    database_file_name_neg = args[2]  # Second parameter: path to negative class file
    minsup = int(args[3])  # Third parameter: minimum support (note: this parameter will be k in case of top-k mining)
    nfolds = int(args[4])  # Fourth parameter: number of folds to use in the k-fold cross-validation.

    if not os.path.exists(database_file_name_pos):
        print('{} does not exist.'.format(database_file_name_pos))
        sys.exit()
    if not os.path.exists(database_file_name_neg):
        print('{} does not exist.'.format(database_file_name_neg))
        sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs of both classes, adding them to the database and getting their ids.
    # The ids are turned into integer arrays: concatenating an empty list slice with a list of ints
    # (which happens for the first fold) would otherwise produce floating point ids.
    pos_ids = numpy.array(graph_database.read_graphs(database_file_name_pos), dtype=int)
    neg_ids = numpy.array(graph_database.read_graphs(database_file_name_neg), dtype=int)

    # If less than two folds: using the same set as training and test set (note this is not an accurate way to evaluate the performances!)
    if nfolds < 2:
        subsets = [
            pos_ids,  # Positive training set
            pos_ids,  # Positive test set
            neg_ids,  # Negative training set
            neg_ids  # Negative test set
        ]
        # Printing fold number:
        print('fold {}'.format(1))
        train_and_evaluate(minsup, graph_database, subsets)

    # Otherwise: performs k-fold cross-validation:
    else:
        # Ids left over by the integer division are never part of a test fold.
        pos_fold_size = len(pos_ids) // nfolds
        neg_fold_size = len(neg_ids) // nfolds
        for i in range(nfolds):
            # Use fold as test set, the others as training set for each class;
            # identify all the subsets to be maintained by the graph mining algorithm.
            subsets = [
                numpy.concatenate((pos_ids[:i * pos_fold_size], pos_ids[(i + 1) * pos_fold_size:])),  # Positive training set
                pos_ids[i * pos_fold_size:(i + 1) * pos_fold_size],  # Positive test set
                numpy.concatenate((neg_ids[:i * neg_fold_size], neg_ids[(i + 1) * neg_fold_size:])),  # Negative training set
                neg_ids[i * neg_fold_size:(i + 1) * neg_fold_size],  # Negative test set
            ]
            # Printing fold number:
            print('fold {}'.format(i + 1))
            train_and_evaluate(minsup, graph_database, subsets)
def train_and_evaluate(minsup, database, subsets):
    """
    Mines the patterns for the given subsets, trains a Gaussian naive Bayes
    classifier on the training feature matrices and prints the patterns, the
    predictions and the accuracy obtained on the test matrices.
    The subsets are expected in the order: positive training, positive test,
    negative training, negative test.
    """
    task = FrequentPositiveGraphs(minsup, database, subsets)
    gSpan(task).run()

    def with_labels(pos_fm, neg_fm):
        # Stack the positive rows above the negative ones; label them 1 and -1.
        stacked = numpy.concatenate((pos_fm, neg_fm))
        pos_labels = numpy.full(len(pos_fm), 1, dtype=int)
        neg_labels = numpy.full(len(neg_fm), -1, dtype=int)
        return stacked, numpy.concatenate((pos_labels, neg_labels))

    features = task.get_feature_matrices()
    train_fm, train_labels = with_labels(features[0], features[2])
    test_fm, test_labels = with_labels(features[1], features[3])

    classifier = naive_bayes.GaussianNB()
    classifier.fit(train_fm, train_labels)
    predicted = classifier.predict(test_fm)
    accuracy = metrics.accuracy_score(test_labels, predicted)

    # Printing frequent patterns along with their positive support:
    for pattern, covers in task.patterns:
        print('{} {}'.format(pattern, len(covers[0])))
    # Printing classification results:
    print(predicted)
    print('accuracy: {}'.format(accuracy))
    print()  # Blank line to indicate end of fold.
# Script entry point: runs the first example. Swap the comment below to run
# example2 instead (it expects a fourth command line argument, the number of
# folds).
if __name__ == '__main__':
    example1()
    # example2()
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter