4.1 百度飞桨GCN代码解析

1. 引入数据

  1. 从库中读取数据
  2. 归一化数据(normalize)
import pgl
from pgl import data_loader
import paddle.fluid as fluid
import numpy as np
import time
import argparse

def normalize(feat):
    """Divide every row of ``feat`` by its sum along the last axis.

    The divisor is clamped to a minimum of 1, so rows whose sum is below 1
    (for example all-zero rows) are returned unchanged instead of being
    blown up or divided by zero.
    """
    row_totals = np.sum(feat, axis=-1, keepdims=True)
    divisor = np.maximum(row_totals, 1)
    return feat / divisor

def load(name, normalized_feature=True):
    """Load a citation dataset and attach the data the GCN layer needs.

    Args:
        name: Dataset name, one of ``"cora"``, ``"pubmed"`` or ``"citeseer"``.
        normalized_feature: When true (the default) the ``"words"`` node
            features are row-normalised with ``normalize``. Pass ``False``
            to keep the raw features. (This flag used to be ignored and the
            features were always normalised.)

    Returns:
        The dataset object; its graph gains a ``"norm"`` node feature
        holding ``max(indegree, 1) ** -0.5`` as a column vector.

    Raises:
        ValueError: If ``name`` is not one of the supported datasets.
    """
    if name == 'cora':
        dataset = data_loader.CoraDataset()
    elif name == "pubmed":
        dataset = data_loader.CitationDataset("pubmed", symmetry_edges=True)
    elif name == "citeseer":
        dataset = data_loader.CitationDataset("citeseer", symmetry_edges=True)
    else:
        raise ValueError(name + " dataset doesn't exists")

    indegree = dataset.graph.indegree()
    # note 1
    # Clamp the in-degree to at least 1 so that nodes without incoming
    # edges do not produce a division by zero in the inverse square root.
    norm = np.maximum(indegree.astype("float32"), 1)
    norm = np.power(norm, -0.5)
    # note 2
    # Stored with a trailing axis of size 1 (one value per node).
    dataset.graph.node_feat["norm"] = np.expand_dims(norm, -1)
    if normalized_feature:
        dataset.graph.node_feat["words"] = normalize(
            dataset.graph.node_feat["words"])
    return dataset

def main(args):
    """Load Cora and store an in-degree based ``"norm"`` node feature.

    The value is ``indegree ** -0.5`` for nodes with at least one incoming
    edge and 0 for all other nodes; it is saved as a column vector and
    printed.
    """
    dataset = load("cora")
    graph = dataset.graph

    # normalize
    indegree = graph.indegree()
    has_incoming = indegree > 0
    norm = np.zeros_like(indegree, dtype="float32")
    norm[has_incoming] = np.power(indegree[has_incoming], -0.5)
    norm_column = np.expand_dims(norm, -1)
    graph.node_feat["norm"] = norm_column
    print("norm result saved in graph.norm")
    print(norm_column)

if __name__ == '__main__':
    # Command-line options of the GCN example, listed in registration order
    # as (flag, keyword arguments for ``add_argument``).
    cli_options = (
        ("--epoch", dict(type=int, default=200, help="Epochs")),
        ("--use_cuda", dict(action='store_true', help="use_cuda")),
        ("--use_demo_gat", dict(action='store_true', help="use_demo_gat")),
        ("--learning_rate",
         dict(type=float, default=0.01, help="learning rate")),
        ("--hidden_size", dict(type=int, default=16, help="hidden size")),
        ("--dropout", dict(type=float, default=0.5, help="dropout")),
    )

    parser = argparse.ArgumentParser(description='GCN')
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)

    args = parser.parse_args()
    main(args)

正常情况下,如果在note1处打印下列信息:

	print("indegree: {0}".format(indegree))
    print('图中节点的向量表示,name:{0}, shape:{1}, dtype:{2}'.format(
        dataset.graph.node_feat_info()[0][0],
        dataset.graph.node_feat_info()[0][1],
        dataset.graph.node_feat_info()[0][2])
    )
    print('图中边的向量表示,name:{0}, shape:{1}, dtype:{2}'.format(
        dataset.graph.edge_feat_info()[0][0],
        dataset.graph.edge_feat_info()[0][1],
        dataset.graph.edge_feat_info()[0][2])
    )

但是比较坑爹的是,在data_loader.py这个文件中,读取函数用的是这个:

跟人家要的不太一样:

所以输出就变成了

note1与note2之间的代码其实是在计算每个节点的归一化系数 $\mathrm{norm}_i = \max(d_i, 1)^{-1/2}$,其中 $d_i$ 是节点 $i$ 的入度,对应 GCN 中的 $D^{-1/2}$。

归一化部分


这里应该是写重复了,都能够达到一样的效果,其中1,3是一样的处理方法。

3. 定义数据

PS:我没写反。我没写反。我没写反。就是这个静态图让我有点混乱。

# Convert the graph into the feed dictionary for the graph wrapper.
feed_dict = gw.to_feed(dataset.graph)

# For every split, look the labels up with the 1-D index first and only then
# add a trailing axis to both arrays, so index and label have matching
# column-vector shapes.
train_index = dataset.train_index
train_label = np.expand_dims(dataset.y[train_index], -1)
train_index = np.expand_dims(train_index, -1)

val_index = dataset.val_index
val_label = np.expand_dims(dataset.y[val_index], -1)
val_index = np.expand_dims(val_index, -1)

test_index = dataset.test_index
test_label = np.expand_dims(dataset.y[test_index], -1)
test_index = np.expand_dims(test_index, -1)

2. 定义model and data

2.1 定义数据读取方式

 ################# define data #########################
    # Run on GPU 0 when --use_cuda was given, otherwise on the CPU.
    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    # Three independent programs are created; test_program is not used
    # anywhere in this snippet.
    train_program = fluid.Program()
    startup_program = fluid.Program()
    test_program = fluid.Program()
    hidden_size = args.hidden_size

    # The graph wrapper is built under train_program / startup_program.
    with fluid.program_guard(train_program, startup_program):
        # Only the node-feature description is passed; edge_feat is left at
        # the constructor's default.
        gw = pgl.graph_wrapper.GraphWrapper(
            name="graph",
            place=place,
            node_feat=dataset.graph.node_feat_info())

我们看一看GraphWrapper中长啥样:


class GraphWrapper(BaseGraphWrapper):
    """Implement a graph wrapper that creates graph data holders whose
    attributes and features in the graph are :code:`L.data`.
    And we provide interface :code:`to_feed` to help converting :code:`Graph`
    data into :code:`feed_dict`.

    Args:
        name: The graph data prefix

        node_feat: A list of tuples that describe the details of node
                   feature tensor. Each tuple must be (name, shape, dtype)
                   and the first dimension of the shape must be set unknown
                   (-1 or None) or we can easily use
                   :code:`Graph.node_feat_info()` to get the node_feat
                   settings.

        edge_feat: A list of tuples that describe the details of edge
                   feature tensor. Each tuple must be (name, shape, dtype)
                   and the first dimension of the shape must be set unknown
                   (-1 or None) or we can easily use
                   :code:`Graph.edge_feat_info()` to get the edge_feat
                   settings.

    Examples:

        .. code-block:: python

            import numpy as np
            import paddle.fluid as fluid
            from pgl.graph import Graph
            from pgl.graph_wrapper import GraphWrapper

            place = fluid.CPUPlace()
            exe = fluid.Executor(place)

            num_nodes = 5
            edges = [ (0, 1), (1, 2), (3, 4)]
            feature = np.random.randn(5, 100)
            edge_feature = np.random.randn(3, 100)
            graph = Graph(num_nodes=num_nodes,
                          edges=edges,
                          node_feat={
                              "feature": feature
                          },
                          edge_feat={
                              "edge_feature": edge_feature
                          })

            graph_wrapper = GraphWrapper(name="graph",
                                         node_feat=graph.node_feat_info(),
                                         edge_feat=graph.edge_feat_info())

            # build your deep graph model
            ...

            # Initialize parameters for deep graph model
            exe.run(fluid.default_startup_program())

            for i in range(10):
                feed_dict = graph_wrapper.to_feed(graph)
                ret = exe.run(fetch_list=[...], feed=feed_dict )
    """

    def __init__(self, name, node_feat=[], edge_feat=[], **kwargs):
        super(GraphWrapper, self).__init__()
        # collect holders for PyReader
        self._data_name_prefix = name
        self._holder_list = []
        # Holders for the graph attributes are created first, then one set
        # of holders per declared node feature and per declared edge feature.
        self.__create_graph_attr_holders()
        for node_feat_name, node_feat_shape, node_feat_dtype in node_feat:
            self.__create_graph_node_feat_holders(
                node_feat_name, node_feat_shape, node_feat_dtype)

        # With the default (empty) edge_feat this loop creates nothing.
        for edge_feat_name, edge_feat_shape, edge_feat_dtype in edge_feat:
            self.__create_graph_edge_feat_holders(
                edge_feat_name, edge_feat_shape, edge_feat_dtype)
    # ... many more methods of this class follow (omitted here)

可以看到,其edge的信息为一个空数组,传进去的只有node_feat

2.2 定义model

大体框架是:1层GCN + dropout + 全连接层(fully connected),然后feed一下node_index和node_label,用gather函数按node_index把对应节点的输出选出来(这个是个啥玩意后面会提),就得到了pred,再与node_label一起通过softmax交叉熵做多分类并计算loss。

2.2.1 GCN函数

        # One graph-convolution layer over the "words" node features with a
        # ReLU activation; the stored "norm" node feature is passed along.
        output = gcn_layer(gw,
                           gw.node_feat["words"],
                           hidden_size,
                           activation="relu",
                           norm=gw.node_feat['norm'],
                           name="gcn_layer_1")
        # Dropout with the probability given by --dropout.
        output = fluid.layers.dropout(
            output, args.dropout, dropout_implementation='upscale_in_train')
        # Final projection: one output unit per class for every node.
        output = fluid.layers.fc(output,
                                 dataset.num_classes,
                                 name="final_fc")
        # Inputs fed at run time: the indices of the nodes to score and
        # their labels, both int64 with shape [None, 1].
        node_index = fluid.layers.data(
            "node_index",
            shape=[None, 1],
            dtype="int64",
            append_batch_size=False)
        node_label = fluid.layers.data(
            "node_label",
            shape=[None, 1],
            dtype="int64",
            append_batch_size=False)

        # Keep only the rows of `output` selected by node_index.
        pred = fluid.layers.gather(output, node_index)
        # return_softmax=True also returns the softmax output, which
        # replaces `pred` from here on.
        loss, pred = fluid.layers.softmax_with_cross_entropy(
            logits=pred, label=node_label, return_softmax=True)
        # Top-1 accuracy (k=1) of the softmax output against the labels.
        acc = fluid.layers.accuracy(input=pred, label=node_label, k=1)
        # Reduce the loss tensor to a single scalar by averaging.
        loss = fluid.layers.mean(loss)
    with fluid.program_guard(train_program, startup_program):
         # Adam with L2 weight decay (coefficient 0.0005) minimising the
         # mean loss; the optimizer is added to train_program.
         adam = fluid.optimizer.Adam(
            learning_rate=args.learning_rate,
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=0.0005))
         adam.minimize(loss)

    # Create the executor on the chosen device and run the startup program
    # once before training.
    exe = fluid.Executor(place)
    exe.run(startup_program)

其中函数gcn_layer则表示为:

def gcn_layer(gw, feature, hidden_size, activation, name, norm=None):
    """Compute one graph-convolution layer by message passing.

    Every edge carries the feature of its source node; each node sums the
    messages it receives and the result goes through a bias-free fully
    connected layer with the given activation. When ``norm`` is given the
    features are multiplied by it before sending and again after receiving.
    (``norm`` used to be accepted but silently ignored.)

    Args:
        gw: Graph wrapper providing ``send`` and ``recv``.
        feature: Node feature tensor, sent under the key ``"h"``.
        hidden_size: Output size of the fully connected layer.
        activation: Activation passed to ``fluid.layers.fc``.
        name: Name of the fully connected layer.
        norm: Optional per-node scaling tensor. The caller in this file
            passes a column with one value per node; it is assumed to
            broadcast against ``feature``. ``None`` disables scaling.

    Returns:
        The output tensor of the fully connected layer.
    """

    # send function
    def send_func(src_feat, dst_feat, edge_feat):
        '''Fill in the blank.

        Hint:
            src_feat holds the source-node features:
                src_feat {"h": Tensor of shape [num_edges, hidden_size]}
            dst_feat holds the destination-node features:
                dst_feat {"h": Tensor of shape [num_edges, hidden_size]}
            This exercise has no edge features, so edge_feat is None.
        '''
        # Question 1: which of the two features below do we choose?

        ans1 = "A"  # "A" or "B"
        if ans1 == "A":
            feat = src_feat["h"]
        elif ans1 == "B":
            feat = dst_feat["h"]
        return feat

    # recv function
    def recv_func(msg):
        '''Fill in the blank.

        Hint:
            1. Function to use: fluid.layers.sequence_pool(x, pool_type)
            2. The received message is a variable-length tensor, called a
               LodTensor in Paddle. For example:
                   msg = [
                       [1, 2],      # features received by node 0
                       [1],         # features received by node 1
                       [2, 3, 4]    # features received by node 2
                   ]
               Variable-length tensors support a family of sequence
               operations such as sequence_pool. Applying a sum
               sequence_pool to the msg above gives:
                   msg = [
                       [3],         # features received by node 0
                       [1],         # features received by node 1
                       [9]          # features received by node 2
                   ]
        '''
        # Question 2: which Recv function does GCN use?

        ans = "A"  # "A" or "B" or "C"
        if ans == "A":
            return fluid.layers.sequence_pool(msg, "sum")
        elif ans == "B":
            return fluid.layers.sequence_pool(msg, "average")
        elif ans == "C":
            return fluid.layers.sequence_pool(msg, "max")

    # Scale the features on the sending side before they are propagated.
    if norm is not None:
        feature = feature * norm

    # Run the message passing: send along every edge, then aggregate.
    msg = gw.send(send_func, nfeat_list=[("h", feature)])
    output = gw.recv(msg, recv_func)

    # Scale the aggregated features on the receiving side as well.
    if norm is not None:
        output = output * norm

    # Fully connected output layer using `activation` as its activation.
    output = fluid.layers.fc(output,
                            size=hidden_size,
                            bias_attr=False,
                            act=activation,
                            name=name)
    return output

通过调用函数send发送消息,这个函数其实是一层鸡肋,其主要功能是检查你要send的数据格式,也就是nfeat_list和efeat_list,只是efeat_list在这段代码中没有用,换句话说,这里还只是基于点特征的GCN,并没有加上边的特征。检查完格式之后,把node的信息按照key: value 的格式存好,这也是为什么**在函数gcn_layer**中src_feat["h"]的key是h的原因。然后使用recv函数去接收send函数发送的节点信息,recv完了之后,放到一个全连接层中,带上激活函数,这样的话一层图卷积就算完成了。
Question1: recv在接收到message之后,可以利用fluid.layers.sequence_pool(msg, "average")决定信息的处理方式,处理方式有哪些呢?
Q1:处理方式,百度提供了6种,如果要改的话,其实还挺麻烦的,因为他给加到静态图里去了,后面就看不懂了T T

Question2: send函数发送哪几个节点的信息呢?为什么需要用input_node的index呢?输出之后为什么不直接是输入node的个数呢?这一段,得看一下feed进来的数据的size。

根据上面feed的数据大小来看,输入数据也就是gw.node_feat["words"]的shape是(2708, 1433),一口气全部吃进去了,然后过一个hidden size=16的GCN层(消息传递 + fc)->(2708, 16),经过一个dropout层->(2708, 16),最后过一个输出维度为num_classes的fc layer->(2708, num_classes),进入到gather层中挑选一下(从这2708个点中选出train_data对应index的那些行),挑选的结果就可以扔到softmax中进行分类了。


关于边特征efeat_list做一点补充:这里默认为None,那是因为在GraphWrapper中,类初始化的时候有这样一个函数:

这个逼又调用了这个逼

然后,在send函数中,如果efeat_list是你自己传进去的话,百度会甩出一个warning来说:因为边被重新排序了,边的顺序已经变了,外部传入的边特征可能和边对不上,如果你不知道怎么改的话,就别乱动。
"The edge features in argument efeat_list should be fetched "
"from a instance of pgl.graph_wrapper.GraphWrapper, "
“because we have sorted the edges and the order of edges is changed.\n”
"Therefore, if you use external edge features, "
"the order of features of each edge may not match its edge, "
“which can cause serious errors.\n”
“If you use the efeat_list correctly, please ignore this warning.”

其中send函数在基类BaseGraphWrapper中:

    def send(self, message_func, nfeat_list=None, efeat_list=None):
        """Send message from all src nodes to dst nodes.

        The UDF message function should have the following format.

        .. code-block:: python

            def message_func(src_feat, dst_feat, edge_feat):
                '''
                Args:
                    src_feat: the node feat dict attached to the src nodes.
                    dst_feat: the node feat dict attached to the dst nodes.
                    edge_feat: the edge feat dict attached to the
                               corresponding (src, dst) edges.

                Return:
                    It should return a tensor or a dictionary of tensor.
                    And each tensor should have a shape of
                    (num_edges, dims).
                '''
                pass

        Args:
            message_func: UDF function.
            nfeat_list: a list of names or tuple (name, tensor)
            efeat_list: a list of names or tuple (name, tensor)

        Return:
            A dictionary of tensor representing the message. Each of the
            values in the dictionary has a shape (num_edges, dim) which
            should be collected by :code:`recv` function.
        """

        def _collect(entries, store_name):
            # An entry is either a feature name, looked up in the wrapper's
            # own feature store, or an explicit (name, tensor) pair.
            collected = {}
            for entry in entries:
                if isinstance(entry, str):
                    collected[entry] = getattr(self, store_name)[entry]
                    continue
                key, tensor = entry
                collected[key] = tensor
            return collected

        if efeat_list is not None:
            # Caller-supplied edge features may not follow the wrapper's
            # edge order, so warn before using them.
            warnings.warn(
                "The edge features in argument `efeat_list` should be fetched "
                "from a instance of `pgl.graph_wrapper.GraphWrapper`, "
                "because we have sorted the edges and the order of edges is changed.\n"
                "Therefore, if you use external edge features, "
                "the order of features of each edge may not match its edge, "
                "which can cause serious errors.\n"
                "If you use the `efeat_list` correctly, please ignore this warning."
            )
        else:
            efeat_list = ()

        if nfeat_list is None:
            nfeat_list = ()

        src, dst = self.edges
        nfeat = _collect(nfeat_list, "node_feat")
        efeat = _collect(efeat_list, "edge_feat")

        return send(src, dst, nfeat, efeat, message_func)

2.2.2 fluid.layers.gather(output, node_index)函数

这个函数其实就是根据index从output中把input_node对应的那些行选出来,作为input_node的输出结果。

def gather(input, index, overwrite=True):
    """ **Gather Layer** Output is obtained by gathering entries of the outer-most dimension of X indexed by `index` and concatenate them together. .. math:: Out = X[Index] .. code-block:: text Given: X = [[1, 2], [3, 4], [5, 6]] Index = [1, 2] Then: Out = [[3, 4], [5, 6]] Args: input (Variable): The source input tensor with rank>=1. Supported data type is int32, int64, float32, float64 and uint8 (only for CPU), float16 (only for GPU). index (Variable): The index input tensor with rank=1. Data type is int32 or int64. overwrite (bool, optional): The mode that updating the grad when has same index. If True, use the overwrite mode to update the grad of the same index, if False, use the accumulate mode to update the grad of the same index. Default value is True. Returns: output (Variable): The output is a tensor with the same rank as input. Examples: .. code-block:: python import paddle.fluid as fluid x = fluid.data(name='x', shape=[-1, 5], dtype='float32') index = fluid.data(name='index', shape=[-1, 1], dtype='int32') output = fluid.layers.gather(x, index) """