[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
use buffer reads in input engine
From: |
Eric Blake |
Subject: |
use buffer reads in input engine |
Date: |
Fri, 29 Feb 2008 22:01:08 +0000 (UTC) |
User-agent: |
Loom/3.14 (http://gmane.org/) |
Here's my first draft of implementing buffer reads, based on the fallout of
Bruno's discussion on the m4-discuss list that making a function call per byte
of input is rather expensive. So far, this just helps with comments resulting
from argument expansion, and with multi-byte delimiters, neither of which is
common. It does not yet use freadahead for getting buffers from files, nor
memchr2 (an as-yet-unwritten gnulib module) for quickly handling quoted strings
from a buffer; even so, it shows good potential for faster execution.
From: Eric Blake <address@hidden>
Date: Fri, 29 Feb 2008 14:39:35 -0700
Subject: [PATCH] Stage29: read input by buffers, not bytes
---
NEWS | 2 +
src/input.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 175 insertions(+), 2 deletions(-)
diff --git a/NEWS b/NEWS
index 6416077..32153d1 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,8 @@ Foundation, Inc.
* Noteworthy changes in Version 1.4.11 (????-??-??) [stable]
Released by ????, based on git version 1.4.10b.x-*
+** Improve the speed of the input engine.
+
** Fix the `m4wrap' builtin to accumulate wrapped text in FIFO order, as
required by POSIX. The manual mentions a way to restore the LIFO order
present in earlier GNU M4 versions.
diff --git a/src/input.c b/src/input.c
index d9d3551..c850e99 100644
--- a/src/input.c
+++ b/src/input.c
@@ -870,6 +870,144 @@ input_print (struct obstack *obs)
}
+/*-------------------------------------------------------------------.
+| Return a pointer to the available bytes of the current input |
+| block, and set *LEN to the length of the result. If ALLOW_QUOTE, |
+| do not return a buffer for a quoted string. If the result of |
+| next_char() would not fit in an unsigned char (for example, |
+| CHAR_EOF or CHAR_QUOTE), or if the input block does not have an |
+| available buffer at the moment (for example, when hitting a buffer |
+| block boundary of a file), return NULL, and the caller must fall |
+| back on using next_char(). The buffer is only valid until the |
+| next consume_buffer() or next_char(). When searching for a |
+| particular byte, it is more efficient to search a buffer at a time |
+| than it is to repeatedly call next_char. |
+`-------------------------------------------------------------------*/
+
+static const char *
+next_buffer (size_t *len, bool allow_quote)
+{
+ token_chain *chain;
+
+ while (1) /* Loop until a buffer is found or NULL must be returned. */
+ {
+ assert (isp);
+ if (input_change) /* Copy the block's file/line into the current location. */
+ {
+ current_file = isp->file;
+ current_line = isp->line;
+ input_change = false;
+ }
+
+ switch (isp->type)
+ {
+ case INPUT_STRING:
+ if (isp->u.u_s.len) /* Unread bytes remain; expose all of them. */
+ {
+ *len = isp->u.u_s.len;
+ return isp->u.u_s.str;
+ }
+ break; /* String exhausted; fall out to pop_input below. */
+
+ case INPUT_FILE:
+ // TODO - use freadahead, freadptr, and freadseek for optimization
+ return NULL; /* No file buffers yet; caller must use next_char(). */
+
+ case INPUT_CHAIN:
+ chain = isp->u.u_c.chain;
+ while (chain) /* Walk links, skipping those with nothing left. */
+ {
+ if (allow_quote && chain->quote_age == current_quote_age)
+ return NULL; /* CHAR_QUOTE doesn't fit in buffer. */
+ switch (chain->type)
+ {
+ case CHAIN_STR:
+ if (chain->u.u_s.len) /* Non-empty string link: hand it out. */
+ {
+ *len = chain->u.u_s.len;
+ return chain->u.u_s.str;
+ }
+ if (chain->u.u_s.level >= 0) /* Empty link; presumably drops its reference. */
+ adjust_refcount (chain->u.u_s.level, false);
+ break;
+ case CHAIN_FUNC:
+ if (chain->u.func)
+ return NULL; /* CHAR_MACRO doesn't fit in buffer. */
+ break;
+ case CHAIN_ARGV:
+ if (chain->u.u_a.index == arg_argc (chain->u.u_a.argv)) /* Index reached argc. */
+ {
+ arg_adjust_refcount (chain->u.u_a.argv, false);
+ break;
+ }
+ return NULL; /* No buffer to provide. */
+ case CHAIN_LOC:
+ isp->file = chain->u.u_l.file; /* Location link: store its file/line in the block. */
+ isp->line = chain->u.u_l.line;
+ input_change = true;
+ isp->u.u_c.chain = chain->next;
+ return next_buffer (len, allow_quote); /* Restart so input_change is applied. */
+ default:
+ assert (!"next_buffer"); /* Unknown chain type. */
+ abort ();
+ }
+ isp->u.u_c.chain = chain = chain->next; /* Drop the finished link and advance. */
+ }
+ break; /* Chain exhausted; fall out to pop_input below. */
+
+ case INPUT_EOF:
+ return NULL; /* CHAR_EOF doesn't fit in buffer. */
+
+ default:
+ assert (!"next_buffer"); /* Unknown input block type. */
+ abort ();
+ }
+
+ /* End of input source --- pop one level. */
+ pop_input (true);
+ }
+}
+
+/*-----------------------------------------------------------------.
+| Consume LEN bytes from the current input block, as though by LEN |
+| calls to next_char(). LEN must be nonzero and no larger than the |
+| length set by the preceding successful call to next_buffer(). |
+`-----------------------------------------------------------------*/
+
+static void
+consume_buffer (size_t len)
+{
+ token_chain *chain;
+
+ assert (isp && !input_change && len); /* No pending location change; LEN nonzero. */
+ switch (isp->type)
+ {
+ case INPUT_STRING:
+ assert (len <= isp->u.u_s.len);
+ isp->u.u_s.len -= len; /* Shrink the remaining count and */
+ isp->u.u_s.str += len; /* advance past the consumed bytes. */
+ break;
+
+ case INPUT_FILE:
+ // TODO - use freadahead, freadptr, and freadseek for optimization
+ assert (!"consume_buffer"); /* next_buffer() never returns a file buffer yet. */
+ abort ();
+
+ case INPUT_CHAIN:
+ chain = isp->u.u_c.chain;
+ assert (chain && chain->type == CHAIN_STR && len <= chain->u.u_s.len);
+ /* Partial consumption invalidates quote age. */
+ chain->quote_age = 0;
+ chain->u.u_s.len -= len; /* Same bookkeeping as INPUT_STRING, on the head link. */
+ chain->u.u_s.str += len;
+ break;
+
+ default:
+ assert (!"consume_buffer"); /* Other block types have no buffer to consume. */
+ abort ();
+ }
+}
+
/*------------------------------------------------------------------.
| Low level input is done a character at a time. The function |
| peek_input () is used to look at the next character in the input |
@@ -1292,11 +1430,22 @@ match_input (const char *s, size_t slen, bool consume)
int ch; /* input character */
const char *t;
bool result = false;
+ size_t len;
+ /* Try a buffer match first. */
assert (slen);
+ t = next_buffer (&len, false);
+ if (t && slen <= len && memcmp (s, t, slen) == 0)
+ {
+ if (consume)
+ consume_buffer (slen);
+ return true;
+ }
+
+ /* Fall back on byte matching. */
ch = peek_input (false);
if (ch != to_uchar (*s))
- return false; /* fail */
+ return false;
if (slen == 1)
{
@@ -1750,7 +1899,29 @@ next_token (token_data *td, int *line, struct obstack
*obs, bool allow_argv,
obstack_grow (obs_td, curr_comm.str1, curr_comm.len1);
while (1)
{
- ch = next_char (false);
+ /* Start with buffer search for potential end delimiter. */
+ const char *buffer;
+ size_t len;
+ buffer = next_buffer (&len, false);
+ if (buffer)
+ {
+ const char *p = (char *) memchr (buffer, *curr_comm.str2, len);
+ if (p)
+ {
+ obstack_grow (obs_td, buffer, p - buffer);
+ ch = to_uchar (*p);
+ consume_buffer (p - buffer + 1);
+ }
+ else
+ {
+ consume_buffer (len);
+ continue;
+ }
+ }
+
+ /* Fall back to byte-wise search. */
+ else
+ ch = next_char (false);
if (ch == CHAR_EOF)
/* Current_file changed to "" if we see CHAR_EOF, use the
previous value we stored earlier. */
--
1.5.4
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- use buffer reads in input engine,
Eric Blake <=