As an example of where you might want to use DynASM, let us consider the problem of transforming an array of binary structures into an array of slightly different binary structures. For the sake of concreteness, let us assume that such transformation jobs are described by the following C structures:

// Describes a single field of a record: its width, and where it is located
// (and in which byte order) in the input record and in the output record.
struct field_info_t
{
  // Width of the field, in bytes.
  int byte_width;

  // Byte offset of the field from the start of an input record.
  int input_offset;
  // Byte-order tag of the field as stored in the input. The code shown only
  // compares it for equality with output_endianness; the meaning of individual
  // values is not defined here.
  int input_endianness;

  // Byte offset of the field from the start of an output record.
  int output_offset;
  // Byte-order tag of the field as it should be stored in the output. When it
  // differs from input_endianness, the field's bytes are reversed.
  int output_endianness;
};

// Describes one transcoding job: an array of fixed-size input records, an
// array of fixed-size output records, and the list of fields to copy from each
// input record to the corresponding output record.
struct transcode_job_t
{
  // Start of the first input record.
  const void* input;
  // Distance in bytes from one input record to the next.
  int input_record_size;
  // Number of records to transcode.
  int num_input_records;

  // Start of the first output record.
  void* output;
  // Distance in bytes from one output record to the next.
  int output_record_size;

  // Number of entries in the fields array.
  int num_fields;
  // Array of num_fields field descriptions, applied to every record.
  field_info_t* fields;
};

Naïve code for performing these jobs might look something like the following:

// Forward declaration: swap_endianness is defined after run_job but is called
// from it, so it has to be declared before use.
void swap_endianness(char* first, char* last);

// Transcodes every record of a job, one field at a time.
//
// For each of job->num_input_records records, every field described in
// job->fields is copied from the input record to the output record, and its
// bytes are reversed if the input and output endianness tags differ.
//
// job: the job to run; job->input and job->output must cover
//      job->num_input_records records of the respective record sizes.
void run_job(transcode_job_t* job)
{
  const char* input = (const char*)job->input;
  char* output = (char*)job->output;
  for(int r = 0; r < job->num_input_records; ++r)
  {
    for(int f = 0; f < job->num_fields; ++f)
    {
      const field_info_t* field = job->fields + f;
      char* dst = output + field->output_offset;

      memcpy(dst, input + field->input_offset, field->byte_width);

      // The bytes were copied verbatim; reverse them in place when the two
      // sides disagree on byte order. The second argument points at the last
      // byte of the field (inclusive).
      if(field->input_endianness != field->output_endianness)
        swap_endianness(dst, dst + field->byte_width - 1);
    }

    // Advance both cursors to the next record.
    input += job->input_record_size;
    output += job->output_record_size;
  }
}

// Reverses, in place, the bytes in the inclusive range [first, last].
// Nothing is changed when first >= last (zero or one byte to reverse).
void swap_endianness(char* first, char* last)
{
  while(first < last)
  {
    // Exchange the outermost pair, then move both ends one step inwards.
    const char held = *last;
    *last = *first;
    *first = held;
    ++first;
    --last;
  }
}

If num_input_records is really large and the transcoding needs to be done as fast as mechanically possible, then one idea might be to unroll the inner loop of run_job at runtime using DynASM. The idea is that the resulting code will look something like the following:

// Transcodes every record of a job using machine code generated specifically
// for that job.
//
// make_transcoder produces a function that transcodes a single record; it is
// then called once per record with the current input and output record
// pointers. The generated code is released before returning, so repeated calls
// to run_job do not leak executable memory.
//
// job: the job to run; job->input and job->output must cover
//      job->num_input_records records of the respective record sizes.
void run_job(transcode_job_t* job)
{
  void (*transcode_one_record)(const char*, char*) = make_transcoder(job);

  const char* input = (const char*)job->input;
  char* output = (char*)job->output;
  for(int r = 0; r < job->num_input_records; ++r)
  {
    transcode_one_record(input, output);

    input += job->input_record_size;
    output += job->output_record_size;
  }

  // make_transcoder obtains the code buffer from VirtualAlloc and hands over
  // ownership; nothing else refers to it once the loop is done, so release it
  // here. MEM_RELEASE requires a size of 0 and frees the whole allocation.
  VirtualFree((void*)transcode_one_record, 0, MEM_RELEASE);
}

As the first step toward implementing make_transcoder, we need something to feed into DynASM. The following code is such an input, which we'll assume is in a file called transcode.dasc:

|.arch x64
|.actionlist transcode_actionlist
|.section code

// Emits the body of a function that transcodes one record of `job`.
//
// For every field of the job, a load / optional byte swap / store sequence is
// emitted with the field's input and output offsets baked in, followed by a
// single `ret`. The emitted code reads the record through rcx and writes it
// through rdx (assumes a calling convention where the first two pointer
// arguments arrive in rcx and rdx, as in the Windows x64 convention).
//
// Dst_DECL: the DynASM state parameter (supplies the `Dst` used by the
//           generated dasm_put calls).
// job:      the job whose fields are to be transcoded.
// Throws for any field whose byte_width is not 4 or 8.
static void emit_transcoder(Dst_DECL, transcode_job_t* job)
{
  for(int f = 0; f < job->num_fields; ++f)
  {
    field_info_t* field = job->fields + f;
    // The field width is known while generating code, so the choice of
    // register size happens here rather than in the emitted code.
    switch(field->byte_width)
    {
    case 4:
      // 32-bit field: load into eax, optionally reverse the bytes, store.
|     mov eax, [rcx + field->input_offset]
      if(field->input_endianness != field->output_endianness) {
|       bswap eax
      }
|     mov [rdx + field->output_offset], eax
      break;
    case 8:
      // 64-bit field: same sequence using the full rax register.
|     mov rax, [rcx + field->input_offset]
      if(field->input_endianness != field->output_endianness) {
|       bswap rax
      }
|     mov [rdx + field->output_offset], rax
      break;
    default:
      // NOTE(review): constructing std::exception from a string literal is a
      // Microsoft extension; confirm the target compiler accepts it.
      throw std::exception("TODO: Other byte widths");
    }
  }
  // Return to the caller once every field has been handled.
| ret
}

With this written, we can use DynASM to transform it into a file called transcode.h using the following command line:

luajit dynasm.lua --nolineno -o transcode.h transcode.dasc

The resulting file, transcode.h, should look something like the following:

//This file has been pre-processed with DynASM.

//|.arch x64
//|.actionlist transcode_actionlist
// (Annotation, not DynASM output) Encoded instruction templates: x64
// instruction bytes (e.g. 195 is the `ret` opcode 0xC3) interleaved with
// DynASM action codes. The integer passed as the second argument of each
// dasm_put call in emit_transcoder is an index into this array.
static const unsigned char transcode_actionlist[27] = {
  139,129,233,255,15,200,255,137,130,233,255,72,139,129,
  233,255,72,15,200,255,72,137,130,233,255,195,255
};

//|.section code
// (Annotation, not DynASM output) A single section named "code" was declared,
// so its index is 0 and the section count is 1; DASM_MAXSECTION is what
// make_transcoder passes to dasm_init.
#define DASM_SECTION_CODE   0
#define DASM_MAXSECTION 	1

// (Annotation, not DynASM output) Pre-processed form of emit_transcoder: each
// assembly line of transcode.dasc has been kept as a `//|` comment and followed
// by a dasm_put call. The second argument of dasm_put is the index in
// transcode_actionlist at which that instruction's encoding starts; any further
// arguments are the run-time values (here, field offsets) used by it.
//
// Dst_DECL: the DynASM state parameter (supplies `Dst`).
// job:      the job whose fields are to be transcoded.
// Throws for any field whose byte_width is not 4 or 8.
static void emit_transcoder(Dst_DECL, transcode_job_t* job)
{
  for(int f = 0; f < job->num_fields; ++f)
  {
    field_info_t* field = job->fields + f;
    switch(field->byte_width)
    {
    case 4:
      // 32-bit field: load, optional byte swap, store.
//|     mov eax, [rcx + field->input_offset]
dasm_put(Dst, 0, field->input_offset);
      if(field->input_endianness != field->output_endianness) {
//|       bswap eax
dasm_put(Dst, 4);
      }
//|     mov [rdx + field->output_offset], eax
dasm_put(Dst, 7, field->output_offset);
      break;
    case 8:
      // 64-bit field: load, optional byte swap, store.
//|     mov rax, [rcx + field->input_offset]
dasm_put(Dst, 11, field->input_offset);
      if(field->input_endianness != field->output_endianness) {
//|       bswap rax
dasm_put(Dst, 16);
      }
//|     mov [rdx + field->output_offset], rax
dasm_put(Dst, 20, field->output_offset);
      break;
    default:
      throw std::exception("TODO: Other byte widths");
    }
  }
  // Final instruction of the generated function.
//| ret
dasm_put(Dst, 25);
}

With this, we're now able to implement make_transcoder:

#define DASM_FDEF static
#include "dynasm/dasm_proto.h" // For declarations of the dasm_ functions
#include "dynasm/dasm_x86.h"   // For x64 implementations of the dasm_ functions
#include "transcode.h"         // For emit_transcoder
#include <assert.h>            // For assert
#include <Windows.h>           // For VirtualAlloc

// Generates machine code that transcodes a single record of `job`.
//
// The instructions are emitted by emit_transcoder into a DynASM state, linked
// to determine the code size, and then encoded into a freshly allocated
// executable buffer.
//
// job: the job to generate code for.
//
// Returns a pointer to the generated function, taking (input record, output
// record). The buffer comes from VirtualAlloc and is owned by the caller, who
// releases it with VirtualFree(ptr, 0, MEM_RELEASE). Returns NULL if linking,
// allocation or encoding fails (in builds with assertions enabled, a DynASM
// link/encode failure trips an assert first, as such a failure indicates a bug
// in the emitted code rather than a run-time condition).
//
// Exceptions thrown by emit_transcoder (unsupported field widths) propagate to
// the caller after the DynASM state has been freed.
void (*make_transcoder(transcode_job_t* job))(const char*, char*)
{
  dasm_State* state;
  int status;
  void* code = NULL;
  size_t code_size = 0;

  dasm_init(&state, DASM_MAXSECTION);
  dasm_setup(&state, transcode_actionlist);

  try
  {
    emit_transcoder(&state, job);
  }
  catch(...)
  {
    // Do not leak the DynASM state when emission is aborted.
    dasm_free(&state);
    throw;
  }

  status = dasm_link(&state, &code_size);
  assert(status == DASM_S_OK);
  if(status == DASM_S_OK)
  {
    code = VirtualAlloc(NULL, code_size, MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE);
    // Only encode into a buffer that was actually obtained.
    if(code != NULL)
    {
      status = dasm_encode(&state, code);
      assert(status == DASM_S_OK);
      if(status != DASM_S_OK)
      {
        // Never hand out a partially encoded buffer.
        VirtualFree(code, 0, MEM_RELEASE);
        code = NULL;
      }
    }
  }

  dasm_free(&state);
  return (void(*)(const char*, char*))code;
}