Sorry, you need to enable JavaScript to visit this website.

Feedback

Your feedback is important to keep improving our website and offer you a more reliable experience.

C for Metal Development Package

The Intel® C for Metal development package is a software development package for Intel® Graphics Technology. It includes the Intel® C for Metal Compiler, the Intel® C for Metal Runtime, Intel® Media Driver for VAAPI, and reference examples, which can be used to develop applications accelerated by Intel® Graphics Media Accelerator. A typical application contains two kinds of source code, kernel and host. The kernel is written in Intel® C for Media language, compiled to GPU ISA binary by the Intel® C for Metal Compiler, and executed on the GPU. The host manages workloads through the Intel® C for Metal Runtime and user mode media driver.

Kernel Programming: Register Usage

BY Li Huang ON Jun 13, 2019

Tutorial 11. Kernel Programming: Register Usage

Here we show three different algorithms for seemingly simple linear filtering. All three algorithms use two 1-d convolutions, horizontal then vertical, to implement the 2-d convolution. This approach minimizes computation; however, it needs more storage for intermediate results.

// linear1d2: replaces the 2-d convolution with two 1-d convolutions.
// The change saves computation but requires more registers, so it is
// a trade-off between time and space. On GEN, every EU-thread gets
// 128x32 bytes of register space. As long as a kernel stays within
// this limit, we should strive for computation efficiency.
//
// ibuf: surface the 8x32-byte input block is read from.
// obuf: surface the 6x24-byte filtered block is written to.
extern "C" _GENX_MAIN_ void
linear1d2(SurfaceIndex ibuf, SurfaceIndex obuf)
{
    matrix<uchar, 8, 32> src;   // input block
    matrix<short, 8, 24> hsum;  // per-row sums of three shifted copies
    matrix<short, 6, 24> vsum;  // sums of three consecutive hsum rows
    matrix<uchar, 6, 24> dst;   // output block

    // With media-walker the thread origin is available through
    // intrinsics, so no per-thread kernel arguments are needed.
    // Each thread covers a 24-byte-wide, 6-row tile of the output.
    uint x_off = get_thread_origin_x() * 24;
    uint y_off = get_thread_origin_y() * 6;

    read(ibuf, x_off, y_off, src);

    // Horizontal pass: add the 24-byte windows that start at byte
    // columns 0, 3 and 6 of every input row.
    // NOTE(review): the 3-byte stride presumably matches a 3-byte
    // pixel format - confirm against the host code.
    hsum = src.select<8,1,24,1>(0,0) + src.select<8,1,24,1>(0,3);
    hsum += src.select<8,1,24,1>(0,6);

    // Vertical pass: add the 6-row windows that start at rows 0, 1, 2.
    vsum = hsum.select<6,1,24,1>(0,0) + hsum.select<6,1,24,1>(1,0);
    vsum += hsum.select<6,1,24,1>(2,0);

    // Each vsum element is a sum of 9 inputs; 0.111f approximates 1/9.
    dst = vsum * 0.111f;

    write(obuf, x_off, y_off, dst);
}
// linearslide: this version also uses two 1-d convolutions to save
// computation. Unlike linear1d2, it uses a sliding-window scheme to
// minimize storage: 3 rows for both the input and the intermediate
// result, and one row for the output. The price is that it loads the
// input one row at a time and stores the output one row at a time.
//
// ibuf: surface the input rows are read from.
// obuf: surface the filtered rows are written to.
extern "C" _GENX_MAIN_ void
linearslide(SurfaceIndex ibuf, SurfaceIndex obuf)
{
    matrix<uchar, 3, 32> window;  // the 3 input rows currently held
    matrix<short, 3, 24> hsum;    // horizontal sums of those 3 rows
    vector<short, 24> vsum;       // vertical sum for one output row
    vector<uchar, 24> dst;        // one output row

    // With media-walker the thread origin is available through
    // intrinsics, so no per-thread kernel arguments are needed.
    // Each thread covers a 24-byte-wide, 6-row tile of the output.
    uint x_off = get_thread_origin_x() * 24;
    uint y_off = get_thread_origin_y() * 6;

    // Prime the window with input rows 0..2.
    read(ibuf, x_off, y_off, window);

    // Horizontal pass over the primed rows: add the 24-byte windows
    // that start at byte columns 0, 3 and 6.
    hsum = window.select<3,1,24,1>(0,0)
        + window.select<3,1,24,1>(0,3)
        + window.select<3,1,24,1>(0,6);

#pragma unroll
    for (int step = 0; step < 5; ++step) {
        // Slot holding the oldest row; it is recycled below.
        const int slot = step % 3;

        // Vertical pass: combine the three rows currently in hsum.
        vsum = hsum.row(0) + hsum.row(1) + hsum.row(2);
        // Sum of 9 inputs; 0.111f approximates 1/9.
        dst = vsum * 0.111f;
        // Emit output row `step`.
        write(obuf, x_off, y_off + step, dst);
        // Fetch input row `step + 3` over the oldest row.
        read(ibuf, x_off, y_off + step + 3, window.row(slot));
        // Redo the horizontal sums for the freshly loaded row only.
        hsum.row(slot) = window.select<1,1,24,1>(slot,0)
            + window.select<1,1,24,1>(slot,3)
            + window.select<1,1,24,1>(slot,6);
    }

    // The window now holds input rows 5..7: produce the last output row.
    vsum = hsum.row(0) + hsum.row(1) + hsum.row(2);
    dst = vsum * 0.111f;
    write(obuf, x_off, y_off + 5, dst);
}
// linearslide2: another sliding-window version.
// Here only the intermediate result is reduced to 3 rows. The entire
// input block and output block stay in registers so that the large
// media-block read and write can still be used.
//
// ibuf: surface the 8x32-byte input block is read from.
// obuf: surface the 6x24-byte filtered block is written to.
extern "C" _GENX_MAIN_ void
linearslide2(SurfaceIndex ibuf, SurfaceIndex obuf)
{
    matrix<uchar, 8, 32> src;   // whole input block
    matrix<short, 3, 24> hsum;  // horizontal sums of 3 consecutive rows
    vector<short, 24> vsum;     // vertical sum for one output row
    matrix<uchar, 6, 24> dst;   // whole output block

    // With media-walker the thread origin is available through
    // intrinsics, so no per-thread kernel arguments are needed.
    // Each thread covers a 24-byte-wide, 6-row tile of the output.
    uint x_off = get_thread_origin_x() * 24;
    uint y_off = get_thread_origin_y() * 6;

    // One media-block read for the full 8x32 input block.
    read(ibuf, x_off, y_off, src);

    // Horizontal pass over input rows 0..2: add the 24-byte windows
    // that start at byte columns 0, 3 and 6.
    hsum = src.select<3,1,24,1>(0,0)
        + src.select<3,1,24,1>(0,3)
        + src.select<3,1,24,1>(0,6);

#pragma unroll
    for (int step = 0; step < 5; ++step) {
        // Slot holding the oldest row of sums, and the input row
        // whose sums replace it.
        const int slot = step % 3;
        const int incoming = step + 3;

        // Vertical pass: combine the three rows currently in hsum.
        vsum = hsum.row(0) + hsum.row(1) + hsum.row(2);
        // Sum of 9 inputs; 0.111f approximates 1/9.
        dst.row(step) = vsum * 0.111f;
        // Slide the window: recompute one row of horizontal sums.
        hsum.row(slot) = src.select<1,1,24,1>(incoming,0)
            + src.select<1,1,24,1>(incoming,3)
            + src.select<1,1,24,1>(incoming,6);
    }

    // hsum now covers input rows 5..7: produce the last output row.
    vsum = hsum.row(0) + hsum.row(1) + hsum.row(2);
    dst.row(5) = vsum * 0.111f;

    // One media-block write for the full 6x24 output block.
    write(obuf, x_off, y_off, dst);
}