use buffer reads in input engine

m4-patches
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
use buffer reads in input engine

From:	Eric Blake
Subject:	use buffer reads in input engine
Date:	Fri, 29 Feb 2008 22:01:08 +0000 (UTC)
User-agent:	Loom/3.14 (http://gmane.org/)
Here's my first draft of implementing buffer reads, based on the fallout of 
Bruno's discussion on the m4-discuss list that making a function call per byte 
of input is rather expensive.  So far, this only helps with comments resulting 
from argument expansion and with multi-byte delimiters, neither of which is 
common.  Even so, and even without using freadahead to get buffers from files 
or memchr2 (an as-yet-unwritten gnulib module) to quickly handle quoted 
strings from a buffer, it shows good potential for faster execution.

From: Eric Blake <address@hidden>
Date: Fri, 29 Feb 2008 14:39:35 -0700
Subject: [PATCH] Stage29: read input by buffers, not bytes

---
 NEWS        |    2 +
 src/input.c |  175 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 175 insertions(+), 2 deletions(-)

diff --git a/NEWS b/NEWS
index 6416077..32153d1 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,8 @@ Foundation, Inc.
 * Noteworthy changes in Version 1.4.11 (????-??-??) [stable]
   Released by ????, based on git version 1.4.10b.x-*
 
+** Improve the speed of the input engine.
+
 ** Fix the `m4wrap' builtin to accumulate wrapped text in FIFO order, as
    required by POSIX.  The manual mentions a way to restore the LIFO order
    present in earlier GNU M4 versions.
diff --git a/src/input.c b/src/input.c
index d9d3551..c850e99 100644
--- a/src/input.c
+++ b/src/input.c
@@ -870,6 +870,144 @@ input_print (struct obstack *obs)
 }
 
 
+/*-------------------------------------------------------------------.
+| Return a pointer to the available bytes of the current input       |
+| block, and set *LEN to the length of the result.  If ALLOW_QUOTE,  |
+| do not return a buffer for a quoted string.  If the result of      |
+| next_char() would not fit in an unsigned char (for example,        |
+| CHAR_EOF or CHAR_QUOTE), or if the input block does not have an    |
+| available buffer at the moment (for example, when hitting a buffer |
+| block boundary of a file), return NULL, and the caller must fall   |
+| back on using next_char().  The buffer is only valid until the     |
+| next consume_buffer() or next_char().  When searching for a        |
+| particular byte, it is more efficient to search a buffer at a time |
+| than it is to repeatedly call next_char.                           |
+`-------------------------------------------------------------------*/
+
+static const char *
+next_buffer (size_t *len, bool allow_quote)
+{
+  token_chain *chain;
+
+  while (1)
+    {
+      assert (isp);
+      if (input_change)
+       {
+         current_file = isp->file;
+         current_line = isp->line;
+         input_change = false;
+       }
+
+      switch (isp->type)
+       {
+       case INPUT_STRING:
+         if (isp->u.u_s.len)
+           {
+             *len = isp->u.u_s.len;
+             return isp->u.u_s.str;
+           }
+         break;
+
+       case INPUT_FILE:
+         // TODO - use freadahead, freadptr, and freadseek for optimization
+         return NULL;
+
+       case INPUT_CHAIN:
+         chain = isp->u.u_c.chain;
+         while (chain)
+           {
+             if (allow_quote && chain->quote_age == current_quote_age)
+               return NULL; /* CHAR_QUOTE doesn't fit in buffer.  */
+             switch (chain->type)
+               {
+               case CHAIN_STR:
+                 if (chain->u.u_s.len)
+                   {
+                     *len = chain->u.u_s.len;
+                     return chain->u.u_s.str;
+                   }
+                 if (chain->u.u_s.level >= 0)
+                   adjust_refcount (chain->u.u_s.level, false);
+                 break;
+               case CHAIN_FUNC:
+                 if (chain->u.func)
+                   return NULL; /* CHAR_MACRO doesn't fit in buffer.  */
+                 break;
+               case CHAIN_ARGV:
+                 if (chain->u.u_a.index == arg_argc (chain->u.u_a.argv))
+                   {
+                     arg_adjust_refcount (chain->u.u_a.argv, false);
+                     break;
+                   }
+                 return NULL; /* No buffer to provide.  */
+               case CHAIN_LOC:
+                 isp->file = chain->u.u_l.file;
+                 isp->line = chain->u.u_l.line;
+                 input_change = true;
+                 isp->u.u_c.chain = chain->next;
+                 return next_buffer (len, allow_quote);
+               default:
+                 assert (!"next_buffer");
+                 abort ();
+               }
+             isp->u.u_c.chain = chain = chain->next;
+           }
+         break;
+
+       case INPUT_EOF:
+         return NULL; /* CHAR_EOF doesn't fit in buffer.  */
+
+       default:
+         assert (!"next_buffer");
+         abort ();
+       }
+
+      /* End of input source --- pop one level.  */
+      pop_input (true);
+    }
+}
+
+/*-----------------------------------------------------------------.
+| Consume LEN bytes from the current input block, as though by LEN |
+| calls to next_char().  LEN must be less than or equal to the     |
+| previous length returned by a successful call to next_buffer().  |
+`-----------------------------------------------------------------*/
+
+static void
+consume_buffer (size_t len)
+{
+  token_chain *chain;
+
+  assert (isp && !input_change && len);
+  switch (isp->type)
+    {
+    case INPUT_STRING:
+      assert (len <= isp->u.u_s.len);
+      isp->u.u_s.len -= len;
+      isp->u.u_s.str += len;
+      break;
+
+    case INPUT_FILE:
+      // TODO - use freadahead, freadptr, and freadseek for optimization
+      assert (!"consume_buffer");
+      abort ();
+
+    case INPUT_CHAIN:
+      chain = isp->u.u_c.chain;
+      assert (chain && chain->type == CHAIN_STR && len <= chain->u.u_s.len);
+      /* Partial consumption invalidates quote age.  */
+      chain->quote_age = 0;
+      chain->u.u_s.len -= len;
+      chain->u.u_s.str += len;
+      break;
+
+    default:
+      assert (!"consume_buffer");
+      abort ();
+    }
+}
+
 /*------------------------------------------------------------------.
 | Low level input is done a character at a time.  The function      |
 | peek_input () is used to look at the next character in the input  |
@@ -1292,11 +1430,22 @@ match_input (const char *s, size_t slen, bool consume)
   int ch;                      /* input character */
   const char *t;
   bool result = false;
+  size_t len;
 
+  /* Try a buffer match first.  */
   assert (slen);
+  t = next_buffer (&len, false);
+  if (t && slen <= len && memcmp (s, t, slen) == 0)
+    {
+      if (consume)
+       consume_buffer (slen);
+      return true;
+    }
+
+  /* Fall back on byte matching.  */
   ch = peek_input (false);
   if (ch != to_uchar (*s))
-    return false;                      /* fail */
+    return false;
 
   if (slen == 1)
     {
@@ -1750,7 +1899,29 @@ next_token (token_data *td, int *line, struct obstack *obs, bool allow_argv,
       obstack_grow (obs_td, curr_comm.str1, curr_comm.len1);
       while (1)
        {
-         ch = next_char (false);
+         /* Start with buffer search for potential end delimiter.  */
+         const char *buffer;
+         size_t len;
+         buffer = next_buffer (&len, false);
+         if (buffer)
+           {
+             const char *p = (char *) memchr (buffer, *curr_comm.str2, len);
+             if (p)
+               {
+                 obstack_grow (obs_td, buffer, p - buffer);
+                 ch = to_uchar (*p);
+                 consume_buffer (p - buffer + 1);
+               }
+             else
+               {
+                 consume_buffer (len);
+                 continue;
+               }
+           }
+
+         /* Fall back to byte-wise search.  */
+         else
+           ch = next_char (false);
          if (ch == CHAR_EOF)
            /* Current_file changed to "" if we see CHAR_EOF, use the
               previous value we stored earlier.  */
-- 
1.5.4
[Prev in Thread]
Current Thread
[Next in Thread]
use buffer reads in input engine, Eric Blake <=
Prev by Date: Re: [18/18] argv_ref speedup: reuse argv in recursion
Previous by thread: release 1.4.10b
Index(es):
- Date
- Thread