Last time, I bemoaned what compilers did to some of the CPython interpreter main loop. Following those remarks, there are three obvious courses of action:

  1. Make targeted improvements to the compilers.
  2. Write the interpreter main loop directly in assembly.
  3. Tweak the C source code to make it more amenable to good compilation.

Option three is the easiest to explore, so let's start with a random benchmark to use as a baseline:

Python 3.6.0+ (default, Mar  7 2017, 00:04:40) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-11)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from performance.benchmarks.bm_nbody import bench_nbody
>>> bench_nbody(10, 'sun', 100000)
9.907924063038081

The following patch is a little long, but each individual change is relatively boring, and all the changes are motivated by what we saw in gcc's assembly:

diff --git a/Python/ceval.c b/Python/ceval.c
index d5172b9..79ccf2a 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -729,7 +729,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
     int opcode;        /* Current opcode */
     int oparg;         /* Current opcode argument, if any */
     enum why_code why; /* Reason for block stack unwind */
-    PyObject **fastlocals, **freevars;
+    PyObject **freevars;
     PyObject *retval = NULL;            /* Return value */
     PyThreadState *tstate = PyThreadState_GET();
     PyCodeObject *co;
@@ -865,7 +865,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
 /* Code access macros */
 
 /* The integer overflow is checked by an assertion below. */
-#define INSTR_OFFSET()  (sizeof(_Py_CODEUNIT) * (int)(next_instr - first_instr))
+#define INSTR_OFFSET()  ((char*)next_instr - (char*)first_instr)
 #define NEXTOPARG()  do { \
         _Py_CODEUNIT word = *next_instr; \
         opcode = _Py_OPCODE(word); \
@@ -959,7 +959,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
 
 /* Local variable macros */
 
-#define GETLOCAL(i)     (fastlocals[i])
+#define GETLOCAL(i)     (f->f_localsplus[i])
 
 /* The SETLOCAL() macro must not DECREF the local variable in-place and
    then store the new value; it must copy the old value to a temporary
@@ -1045,7 +1045,6 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
     co = f->f_code;
     names = co->co_names;
     consts = co->co_consts;
-    fastlocals = f->f_localsplus;
     freevars = f->f_localsplus + co->co_nlocals;
     assert(PyBytes_Check(co->co_code));
     assert(PyBytes_GET_SIZE(co->co_code) <= INT_MAX);
@@ -1228,7 +1227,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
             FAST_DISPATCH();
 
         TARGET(LOAD_FAST) {
-            PyObject *value = GETLOCAL(oparg);
+            PyObject *value = GETLOCAL((unsigned)oparg);
             if (value == NULL) {
                 format_exc_check_arg(PyExc_UnboundLocalError,
                                      UNBOUNDLOCAL_ERROR_MSG,
@@ -1242,7 +1241,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
 
         PREDICTED(LOAD_CONST);
         TARGET(LOAD_CONST) {
-            PyObject *value = GETITEM(consts, oparg);
+            PyObject *value = GETITEM(consts, (unsigned)oparg);
             Py_INCREF(value);
             PUSH(value);
             FAST_DISPATCH();
@@ -1251,7 +1250,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         PREDICTED(STORE_FAST);
         TARGET(STORE_FAST) {
             PyObject *value = POP();
-            SETLOCAL(oparg, value);
+            SETLOCAL((unsigned)oparg, value);
             FAST_DISPATCH();
         }
 
@@ -1526,7 +1525,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
 
         TARGET(LIST_APPEND) {
             PyObject *v = POP();
-            PyObject *list = PEEK(oparg);
+            PyObject *list = PEEK((size_t)(unsigned)oparg);
             int err;
             err = PyList_Append(list, v);
             Py_DECREF(v);
@@ -1731,7 +1730,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
             _Py_IDENTIFIER(__annotations__);
             PyObject *ann_dict;
             PyObject *ann = POP();
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             int err;
             if (f->f_locals == NULL) {
                 PyErr_Format(PyExc_SystemError,
@@ -2155,7 +2154,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(STORE_NAME) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *v = POP();
             PyObject *ns = f->f_locals;
             int err;
@@ -2176,7 +2175,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(DELETE_NAME) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *ns = f->f_locals;
             int err;
             if (ns == NULL) {
@@ -2198,7 +2197,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         TARGET(UNPACK_SEQUENCE) {
             PyObject *seq = POP(), *item, **items;
             if (PyTuple_CheckExact(seq) &&
-                PyTuple_GET_SIZE(seq) == oparg) {
+                PyTuple_GET_SIZE(seq) == (Py_ssize_t)(size_t)(unsigned)oparg) {
                 items = ((PyTupleObject *)seq)->ob_item;
                 while (oparg--) {
                     item = items[oparg];
@@ -2206,7 +2205,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
                     PUSH(item);
                 }
             } else if (PyList_CheckExact(seq) &&
-                       PyList_GET_SIZE(seq) == oparg) {
+                       PyList_GET_SIZE(seq) == (Py_ssize_t)(size_t)(unsigned)oparg) {
                 items = ((PyListObject *)seq)->ob_item;
                 while (oparg--) {
                     item = items[oparg];
@@ -2215,7 +2214,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
                 }
             } else if (unpack_iterable(seq, oparg, -1,
                                        stack_pointer + oparg)) {
-                STACKADJ(oparg);
+                STACKADJ((unsigned)oparg);
             } else {
                 /* unpack_iterable() raised an exception */
                 Py_DECREF(seq);
@@ -2241,7 +2240,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(STORE_ATTR) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *owner = TOP();
             PyObject *v = SECOND();
             int err;
@@ -2255,7 +2254,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(DELETE_ATTR) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *owner = POP();
             int err;
             err = PyObject_SetAttr(owner, name, (PyObject *)NULL);
@@ -2266,7 +2265,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(STORE_GLOBAL) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *v = POP();
             int err;
             err = PyDict_SetItem(f->f_globals, name, v);
@@ -2277,7 +2276,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(DELETE_GLOBAL) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             int err;
             err = PyDict_DelItem(f->f_globals, name);
             if (err != 0) {
@@ -2289,7 +2288,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(LOAD_NAME) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *locals = f->f_locals;
             PyObject *v;
             if (locals == NULL) {
@@ -2340,7 +2339,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(LOAD_GLOBAL) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *v;
             if (PyDict_CheckExact(f->f_globals)
                 && PyDict_CheckExact(f->f_builtins))
@@ -2385,9 +2384,9 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(DELETE_FAST) {
-            PyObject *v = GETLOCAL(oparg);
+            PyObject *v = GETLOCAL((unsigned)oparg);
             if (v != NULL) {
-                SETLOCAL(oparg, NULL);
+                SETLOCAL((unsigned)oparg, NULL);
                 DISPATCH();
             }
             format_exc_check_arg(
@@ -2488,7 +2487,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(BUILD_TUPLE) {
-            PyObject *tup = PyTuple_New(oparg);
+            PyObject *tup = PyTuple_New((unsigned)oparg);
             if (tup == NULL)
                 goto error;
             while (--oparg >= 0) {
@@ -2500,7 +2499,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(BUILD_LIST) {
-            PyObject *list =  PyList_New(oparg);
+            PyObject *list =  PyList_New((unsigned)oparg);
             if (list == NULL)
                 goto error;
             while (--oparg >= 0) {
@@ -2571,7 +2570,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
                     err = PySet_Add(set, item);
                 Py_DECREF(item);
             }
-            STACKADJ(-oparg);
+            STACKADJ(-(size_t)(unsigned)oparg);
             if (err != 0) {
                 Py_DECREF(set);
                 goto error;
@@ -2601,7 +2600,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
 
         TARGET(BUILD_MAP) {
             Py_ssize_t i;
-            PyObject *map = _PyDict_NewPresized((Py_ssize_t)oparg);
+            PyObject *map = _PyDict_NewPresized((size_t)(unsigned)oparg);
             if (map == NULL)
                 goto error;
             for (i = oparg; i > 0; i--) {
@@ -2684,12 +2683,12 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
             PyObject *map;
             PyObject *keys = TOP();
             if (!PyTuple_CheckExact(keys) ||
-                PyTuple_GET_SIZE(keys) != (Py_ssize_t)oparg) {
+                PyTuple_GET_SIZE(keys) != (Py_ssize_t)(size_t)(unsigned)oparg) {
                 PyErr_SetString(PyExc_SystemError,
                                 "bad BUILD_CONST_KEY_MAP keys argument");
                 goto error;
             }
-            map = _PyDict_NewPresized((Py_ssize_t)oparg);
+            map = _PyDict_NewPresized((size_t)(unsigned)oparg);
             if (map == NULL) {
                 goto error;
             }
@@ -2746,7 +2745,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
             for (i = oparg; i > 0; i--) {
                 PyObject *arg = PEEK(i);
                 if (_PyDict_MergeEx(sum, arg, 2) < 0) {
-                    PyObject *func = PEEK(2 + oparg);
+                    PyObject *func = PEEK(2 + (unsigned)oparg);
                     if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
                         PyErr_Format(PyExc_TypeError,
                                 "%.200s%.200s argument after ** "
@@ -2810,7 +2809,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(LOAD_ATTR) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *owner = TOP();
             PyObject *res = PyObject_GetAttr(owner, name);
             Py_DECREF(owner);
@@ -2835,7 +2834,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(IMPORT_NAME) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *fromlist = POP();
             PyObject *level = TOP();
             PyObject *res;
@@ -2869,7 +2868,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(IMPORT_FROM) {
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *from = TOP();
             PyObject *res;
             res = import_from(from, name);
@@ -2880,7 +2879,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(JUMP_FORWARD) {
-            JUMPBY(oparg);
+            JUMPBY((unsigned)oparg);
             FAST_DISPATCH();
         }
 
@@ -2894,7 +2893,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
             }
             if (cond == Py_False) {
                 Py_DECREF(cond);
-                JUMPTO(oparg);
+                JUMPTO((unsigned)oparg);
                 FAST_DISPATCH();
             }
             err = PyObject_IsTrue(cond);
@@ -2902,7 +2901,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
             if (err > 0)
                 err = 0;
             else if (err == 0)
-                JUMPTO(oparg);
+                JUMPTO((unsigned)oparg);
             else
                 goto error;
             DISPATCH();
@@ -2918,14 +2917,14 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
             }
             if (cond == Py_True) {
                 Py_DECREF(cond);
-                JUMPTO(oparg);
+                JUMPTO((unsigned)oparg);
                 FAST_DISPATCH();
             }
             err = PyObject_IsTrue(cond);
             Py_DECREF(cond);
             if (err > 0) {
                 err = 0;
-                JUMPTO(oparg);
+                JUMPTO((unsigned)oparg);
             }
             else if (err == 0)
                 ;
@@ -2943,7 +2942,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
                 FAST_DISPATCH();
             }
             if (cond == Py_False) {
-                JUMPTO(oparg);
+                JUMPTO((unsigned)oparg);
                 FAST_DISPATCH();
             }
             err = PyObject_IsTrue(cond);
@@ -2953,7 +2952,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
                 err = 0;
             }
             else if (err == 0)
-                JUMPTO(oparg);
+                JUMPTO((unsigned)oparg);
             else
                 goto error;
             DISPATCH();
@@ -2968,13 +2967,13 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
                 FAST_DISPATCH();
             }
             if (cond == Py_True) {
-                JUMPTO(oparg);
+                JUMPTO((unsigned)oparg);
                 FAST_DISPATCH();
             }
             err = PyObject_IsTrue(cond);
             if (err > 0) {
                 err = 0;
-                JUMPTO(oparg);
+                JUMPTO((unsigned)oparg);
             }
             else if (err == 0) {
                 STACKADJ(-1);
@@ -2987,7 +2986,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
 
         PREDICTED(JUMP_ABSOLUTE);
         TARGET(JUMP_ABSOLUTE) {
-            JUMPTO(oparg);
+            JUMPTO((unsigned)oparg);
 #if FAST_LOOPS
             /* Enabling this path speeds-up all while and for-loops by bypassing
                the per-loop checks for signals.  By default, this should be turned-off
@@ -3065,7 +3064,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
             /* iterator ended normally */
             STACKADJ(-1);
             Py_DECREF(iter);
-            JUMPBY(oparg);
+            JUMPBY((unsigned)oparg);
             PREDICT(POP_BLOCK);
             DISPATCH();
         }
@@ -3076,7 +3075,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
         }
 
         TARGET(CONTINUE_LOOP) {
-            retval = PyLong_FromLong(oparg);
+            retval = PyLong_FromLong((unsigned)oparg);
             if (retval == NULL)
                 goto error;
             why = WHY_CONTINUE;
@@ -3755,7 +3754,7 @@ format_missing(const char *kind, PyCodeObject *co, PyObject *names)
 
 static void
 missing_arguments(PyCodeObject *co, Py_ssize_t missing, Py_ssize_t defcount,
-                  PyObject **fastlocals)
+                  PyFrameObject *f)
 {
     Py_ssize_t i, j = 0;
     Py_ssize_t start, end;
@@ -3793,7 +3792,7 @@ missing_arguments(PyCodeObject *co, Py_ssize_t missing, Py_ssize_t defcount,
 
 static void
 too_many_positional(PyCodeObject *co, Py_ssize_t given, Py_ssize_t defcount,
-                    PyObject **fastlocals)
+                    PyFrameObject *f)
 {
     int plural;
     Py_ssize_t kwonly_given = 0;
@@ -3863,7 +3862,7 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
     PyCodeObject* co = (PyCodeObject*)_co;
     PyFrameObject *f;
     PyObject *retval = NULL;
-    PyObject **fastlocals, **freevars;
+    PyObject **freevars;
     PyThreadState *tstate;
     PyObject *x, *u;
     const Py_ssize_t total_args = co->co_argcount + co->co_kwonlyargcount;
@@ -3883,7 +3882,6 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
     if (f == NULL) {
         return NULL;
     }
-    fastlocals = f->f_localsplus;
     freevars = f->f_localsplus + co->co_nlocals;
 
     /* Create a dictionary for keyword parameters (**kwags) */
@@ -3990,7 +3988,7 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
 
     /* Check the number of positional arguments */
     if (argcount > co->co_argcount && !(co->co_flags & CO_VARARGS)) {
-        too_many_positional(co, argcount, defcount, fastlocals);
+        too_many_positional(co, argcount, defcount, f);
         goto fail;
     }
 
@@ -4004,7 +4002,7 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
             }
         }
         if (missing) {
-            missing_arguments(co, missing, defcount, fastlocals);
+            missing_arguments(co, missing, defcount, f);
             goto fail;
         }
         if (n > m)
@@ -4039,7 +4037,7 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
             missing++;
         }
         if (missing) {
-            missing_arguments(co, missing, -1, fastlocals);
+            missing_arguments(co, missing, -1, f);
             goto fail;
         }
     }
@@ -4845,7 +4843,6 @@ _PyFunction_FastCall(PyCodeObject *co, PyObject **args, Py_ssize_t nargs,
 {
     PyFrameObject *f;
     PyThreadState *tstate = PyThreadState_GET();
-    PyObject **fastlocals;
     Py_ssize_t i;
     PyObject *result;
 
@@ -4861,11 +4858,9 @@ _PyFunction_FastCall(PyCodeObject *co, PyObject **args, Py_ssize_t nargs,
         return NULL;
     }
 
-    fastlocals = f->f_localsplus;
-
     for (i = 0; i < nargs; i++) {
         Py_INCREF(*args);
-        fastlocals[i] = *args++;
+        f->f_localsplus[(size_t)i] = *args++;
     }
     result = PyEval_EvalFrameEx(f,0);
 
@@ -5335,9 +5330,8 @@ unicode_concatenate(PyObject *v, PyObject *w,
         switch (opcode) {
         case STORE_FAST:
         {
-            PyObject **fastlocals = f->f_localsplus;
-            if (GETLOCAL(oparg) == v)
-                SETLOCAL(oparg, NULL);
+            if (GETLOCAL((unsigned)oparg) == v)
+                SETLOCAL((unsigned)oparg, NULL);
             break;
         }
         case STORE_DEREF:
@@ -5352,7 +5346,7 @@ unicode_concatenate(PyObject *v, PyObject *w,
         case STORE_NAME:
         {
             PyObject *names = f->f_code->co_names;
-            PyObject *name = GETITEM(names, oparg);
+            PyObject *name = GETITEM(names, (unsigned)oparg);
             PyObject *locals = f->f_locals;
             if (PyDict_CheckExact(locals) &&
                 PyDict_GetItem(locals, name) == v) {

With all of these changes applied, we get a 1.3% speedup:

Python 3.6.0+ (default, Mar  7 2017, 00:06:13) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-11)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from performance.benchmarks.bm_nbody import bench_nbody
>>> bench_nbody(10, 'sun', 100000)
9.777307317999657

A 1.3% speedup is simutaneously not very much, and also a surprisingly large amount for eliminating just a few instructions here and there. Of course, your mileage may vary, and this is just one randomly chosen benchmark.