summaryrefslogtreecommitdiffstats
path: root/src/translators/btparse/lex_auxiliary.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/translators/btparse/lex_auxiliary.c')
-rw-r--r--src/translators/btparse/lex_auxiliary.c939
1 files changed, 939 insertions, 0 deletions
diff --git a/src/translators/btparse/lex_auxiliary.c b/src/translators/btparse/lex_auxiliary.c
new file mode 100644
index 0000000..8fac463
--- /dev/null
+++ b/src/translators/btparse/lex_auxiliary.c
@@ -0,0 +1,939 @@
+/* ------------------------------------------------------------------------
+@NAME : lex_auxiliary.c
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: The code and global variables here have three main purposes:
+ - maintain the lexical buffer (zztoktext, which
+ traditionally with PCCTS is a static array; I have
+ changed things so that it's dynamically allocated and
+ resized on overflow)
+ - keep track of lexical state that's not handled by PCCTS
+ code (like "where are we in terms of BibTeX entries?" or
+ "what are the delimiters for the current entry/string?")
+ - everything called from lexical actions is here, to keep
+ the grammar file itself neat and clean
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : Greg Ward, 1996/07/25-28
+@MODIFIED : Jan 1997
+ Jun 1997
+@VERSION : $Id: lex_auxiliary.c,v 1.31 1999/11/29 01:13:10 greg Rel $
+@COPYRIGHT : Copyright (c) 1996-99 by Gregory P. Ward. All rights reserved.
+
+ This file is part of the btparse library. This library is
+ free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+-------------------------------------------------------------------------- */
+
+/*#include "bt_config.h"*/
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <assert.h>
+#include "lex_auxiliary.h"
+#include "stdpccts.h"
+#include "error.h"
+#include "prototypes.h"
+/*#include "my_dmalloc.h"*/
+
+/*
+ * DUPE_TEXT selects how zzcr_attr() stores token text: non-zero means each
+ * attribute gets its own strdup()'d copy (and zzd_attr() is compiled in to
+ * free it); zero means the attribute points straight at the text it was
+ * handed.
+ */
+#define DUPE_TEXT 0
+
+extern char * InputFilename; /* from input.c */
+
+/*
+ * Instantiate the two error-reporting functions used throughout this file,
+ * lexical_warning() and lexical_error(); both are called with a
+ * printf-style format and arguments.
+ * NOTE(review): GEN_PRIVATE_ERRFUNC is not defined in this file -- it
+ * presumably comes from error.h and tags each message with InputFilename
+ * and zzline; confirm there.
+ */
+GEN_PRIVATE_ERRFUNC (lexical_warning, (const char * fmt, ...),
+ BTERR_LEXWARN, InputFilename, zzline, NULL, -1, fmt)
+GEN_PRIVATE_ERRFUNC (lexical_error, (const char * fmt, ...),
+ BTERR_LEXERR, InputFilename, zzline, NULL, -1, fmt)
+
+
+
+/* ----------------------------------------------------------------------
+ * Global variables
+ */
+
+/*
+ * First, the lexical buffer. This is used elsewhere, so can't be static.
+ * It is NULL until alloc_lex_buffer() allocates it, is resized by
+ * realloc_lex_buffer(), and is set back to NULL by free_lex_buffer().
+ */
+char * zztoktext = NULL;
+
+/*
+ * Now, the lexical state -- first, stuff that arises from scanning
+ * at top-level and the beginnings of entries;
+ * EntryState:
+ * toplevel when we start scanning a file, or when we are in in_entry
+ * mode and see '}' or ')'
+ * after_at when we are in toplevel mode and see an '@'
+ * after_type when we are in after_at mode and see a name (!= 'comment')
+ * in_comment when we are in after_at mode and see a name (== 'comment')
+ * in_entry when we are in after_type mode and see '{' or '('
+ * EntryOpener:
+ * the character ('(' or '{') which opened the entry currently being
+ * scanned (we use this to make sure that the entry opener and closer
+ * match; if not, we issue a warning)
+ * EntryMetatype: (NB. typedef for bt_metatype is in btparse.h)
+ * classifies entries according to the syntax we will use to parse them;
+ * also winds up (after being changed to a bt_nodetype value) in the
+ * node that roots the entry AST:
+ * comment - anything between () or {}
+ * preamble - a single compound value
+ * string - a list of "name = compound_value" assignments; no key
+ * alias - a single "name = compound_value" assignment (where
+ * the compound value in this case is presumably a
+ * name, rather than a string -- this is not syntactically
+ * checked though)
+ * modify,
+ * entry - a key followed by a list of "name = compound_value"
+ * assignments
+ * JunkCount:
+ * the number of non-whitespace, non-'@' characters seen at toplevel
+ * between two entries (used to print out a warning when we hit
+ * the beginning of entry, to help people catch "old style" implicit
+ * comments)
+ */
+/*
+ * All four variables below are (re)set together by
+ * initialize_lexer_state(): EntryState to toplevel, EntryOpener to 0,
+ * EntryMetatype to BTE_UNKNOWN and JunkCount to 0.
+ */
+static enum { toplevel, after_at, after_type, in_comment, in_entry }
+ EntryState;
+static char EntryOpener; /* '(' or '{' */
+static bt_metatype
+ EntryMetatype;
+static int JunkCount; /* non-whitespace chars at toplevel */
+
+/*
+ * String state -- these are maintained and used by the functions called
+ * from actions in the string lexer.
+ * BraceDepth:
+ * brace depth within a string; we can only end the current string
+ * when this is zero
+ * ParenDepth:
+ * parenthesis depth within a string; needed for @comment entries
+ * that are paren-delimited (because the comment in that case is
+ * a paren-delimited string)
+ * StringOpener:
+ * similar to EntryOpener, but stronger than merely warning of token
+ * mismatch -- this determines which character ('"' or '}') can
+ * actually end the string
+ * StringStart:
+ * line on which current string started; if we detect an apparent
+ * runaway, this is used to report where the runaway started
+ * ApparentRunaway:
+ * flags if we have already detected (and warned) that the current
+ * string appears to be a runaway, so that we don't warn again
+ * (and again and again and again)
+ * QuoteWarned:
+ * flags if we have already warned about seeing a '"' in a string,
+ * because they tend to come in pairs and one warning per string
+ * is enough
+ *
+ * (See bibtex.g for an explanation of my runaway string detection heuristic.)
+ */
+/*
+ * start_string() initialises all six of these for each new string;
+ * end_string() puts StringOpener and StringStart back to their idle
+ * values ('\0' and -1 respectively).
+ */
+static char StringOpener = '\0'; /* '{', '(' or '"' (see start_string()) */
+static int BraceDepth; /* depth of brace-nesting */
+static int ParenDepth; /* depth of parenthesis-nesting */
+static int StringStart = -1; /* start line of current string */
+static int ApparentRunaway; /* current string looks like runaway */
+static int QuoteWarned; /* already warned about " in string? */
+
+
+
+/* ----------------------------------------------------------------------
+ * Miscellaneous functions:
+ * lex_info() (handy for debugging)
+ * zzcr_attr() (called from PCCTS-generated code)
+ */
+
+/*
+ * lex_info ()
+ *
+ * Debugging aid: prints the text, the token number and the zztokens[]
+ * entry of the first lookahead token -- and of the second one too when
+ * LL_K is defined.
+ */
+void lex_info (void)
+{
+   int tok;
+
+   tok = LA(1);
+   printf ("LA(1) = \"%s\" token %d, %s\n", LATEXT(1), tok, zztokens[tok]);
+#ifdef LL_K
+   tok = LA(2);
+   printf ("LA(2) = \"%s\" token %d, %s\n", LATEXT(2), tok, zztokens[tok]);
+#endif
+}
+
+
+/*
+ * zzcr_attr ()
+ *
+ * Fills in the attribute `a' for a token of type `tok' with text `txt'.
+ *
+ * For STRING tokens the surrounding delimiters ({...} or "...") are
+ * stripped *in place*: the closing delimiter in `txt' is overwritten with
+ * a NUL and the stored text starts one character past the opening one.
+ * The text is either duplicated (DUPE_TEXT true) or referenced directly;
+ * the attribute also records the token type, zzline and zzbegcol.
+ */
+void zzcr_attr (Attrib *a, int tok, char *txt)
+{
+   if (tok == STRING)
+   {
+      size_t len = strlen (txt);
+
+      /*
+       * Check the length before indexing txt[len-1], so that empty text
+       * can never make us touch txt[-1].
+       */
+      assert (len >= 2
+              && ((txt[0] == '{' && txt[len-1] == '}')
+                  || (txt[0] == '"' && txt[len-1] == '"')));
+
+      /* still refuse to strip anything if assertions are compiled out */
+      if (len >= 2)
+      {
+         txt[len-1] = (char) 0;          /* remove closing quote from string */
+         txt++;                          /* so we'll skip the opening quote */
+      }
+   }
+
+#if DUPE_TEXT
+   a->text = strdup (txt);
+#else
+   a->text = txt;
+#endif
+   a->token = tok;
+   a->line = zzline;
+   a->offset = zzbegcol;
+#if DEBUG > 1
+   dprintf ("zzcr_attr: input txt = %p (%s)\n", txt, txt);
+   dprintf (" dupe txt = %p (%s)\n", a->text, a->text);
+#endif
+}
+
+
+#if DUPE_TEXT
+/*
+ * zzd_attr ()
+ *
+ * Releases the copy of the token text that zzcr_attr() made with strdup()
+ * (only compiled when DUPE_TEXT is true).  The pointer is cleared
+ * afterwards so that a second call on the same attribute is harmless
+ * rather than a double free.
+ */
+void zzd_attr (Attrib *attr)
+{
+   free (attr->text);
+   attr->text = NULL;
+}
+#endif
+
+
+/* ----------------------------------------------------------------------
+ * Lexical buffer functions:
+ * alloc_lex_buffer()
+ * realloc_lex_buffer()
+ * free_lex_buffer()
+ * lexer_overflow()
+ * zzcopy() (only if ZZCOPY_FUNCTION is defined and true)
+ */
+
+
+/*
+ * alloc_lex_buffer()
+ *
+ * Allocates the lexical buffer with `size' characters, all zeroed,
+ * points zzlextext at it, and sets zzbufsize to `size'.
+ *
+ * Does nothing if the buffer is already allocated.  If the allocation
+ * fails, an internal error is reported and no global is modified (the
+ * original code went on to memset() a NULL pointer).
+ *
+ * globals: zztoktext, zzlextext, zzbufsize
+ * callers: bt_parse_entry() (in input.c)
+ */
+void alloc_lex_buffer (int size)
+{
+   char * buffer;
+
+   if (zztoktext != NULL)               /* already have a buffer */
+      return;
+
+   /* calloc() hands back zeroed memory, replacing malloc() + memset() */
+   buffer = (char *) calloc ((size_t) size, sizeof (char));
+   if (buffer == NULL)
+   {
+      internal_error ("out of memory allocating lexical buffer");
+      return;                           /* in case internal_error() returns */
+   }
+
+   zztoktext = buffer;
+   zzlextext = zztoktext;
+   zzbufsize = size;
+} /* alloc_lex_buffer() */
+
+
+/*
+ * realloc_lex_buffer()
+ *
+ * Reallocates the lexical buffer -- size is increased by `size_increment'
+ * characters (which could be negative).  Updates all globals that point
+ * to or into the buffer (zzlextext, zzbegexpr, zzendexpr), as well as
+ * zztoktext (the buffer itself) and zzbufsize (the buffer size).  When the
+ * buffer grows, the newly added tail is zeroed.
+ *
+ * This is only meant to be called (ultimately) from zzgettok(), part of
+ * the DLG code.  (In fact, zzgettok() invokes the ZZCOPY() macro, which
+ * calls lexer_overflow() on buffer overflow, which calls
+ * realloc_lex_buffer().  Whatever.)  The `lastpos' and `nextpos' arguments
+ * correspond, respectively, to a local variable in zzgettok() and a static
+ * global in dlgauto.h (hence really in scan.c).  They both point into
+ * the lexical buffer, so have to be passed by reference here so that
+ * we can update them to point into the newly-reallocated buffer.
+ * `lastpos' may be NULL (it is then left alone); `nextpos' may not.
+ *
+ * If realloc() fails, an internal error is reported and the old buffer
+ * and every pointer into it are left untouched.
+ *
+ * globals: zztoktext, zzbufsize, zzlextext, zzbegexpr, zzendexpr
+ * callers: lexer_overflow()
+ */
+static void
+realloc_lex_buffer (int size_increment,
+                    unsigned char ** lastpos,
+                    unsigned char ** nextpos)
+{
+   int    beg, end, next;
+   char * grown;
+
+   if (zztoktext == NULL)
+      internal_error ("attempt to reallocate unallocated lexical buffer");
+
+   /*
+    * Turn every pointer into the buffer into an offset *before* calling
+    * realloc(): once the block has moved, the old pointer values are
+    * indeterminate and must not be used in arithmetic.
+    */
+   beg = zzbegexpr - zzlextext;
+   end = zzendexpr - zzlextext;
+   next = *nextpos - zzlextext;
+
+   /* keep the old pointer until we know realloc() succeeded */
+   grown = (char *) realloc (zztoktext, zzbufsize+size_increment);
+   if (grown == NULL)
+   {
+      internal_error ("out of memory reallocating lexical buffer");
+      return;                           /* in case internal_error() returns */
+   }
+   zztoktext = grown;
+
+   /* only a growing buffer has a new tail to clear */
+   if (size_increment > 0)
+      memset (zztoktext+zzbufsize, 0, size_increment);
+   zzbufsize += size_increment;
+
+   /* re-anchor everything on the (possibly moved) buffer */
+   zzlextext = zztoktext;
+   if (lastpos != NULL)
+      *lastpos = zzlextext+zzbufsize-1;
+   zzbegexpr = zzlextext + beg;
+   zzendexpr = zzlextext + end;
+   *nextpos = zzlextext + next;
+
+} /* realloc_lex_buffer() */
+
+
+/*
+ * free_lex_buffer()
+ *
+ * Releases the lexical buffer obtained from alloc_lex_buffer() and marks
+ * it as unallocated by resetting zztoktext to NULL.  Calling this when
+ * there is no buffer is reported as an internal error.
+ */
+void free_lex_buffer (void)
+{
+   char * doomed = zztoktext;
+
+   if (doomed == NULL)
+   {
+      internal_error ("attempt to free unallocated (or already freed) "
+                      "lexical buffer");
+   }
+
+   zztoktext = NULL;
+   free (doomed);
+} /* free_lex_buffer() */
+
+
+/*
+ * lexer_overflow()
+ *
+ * Prints a notification and calls realloc_lex_buffer() to increase the
+ * size of the lexical buffer by ZZLEXBUFSIZE (a constant -- hence the
+ * buffer size increases linearly, not exponentially).  `lastpos' and
+ * `nextpos' are handed through unchanged.
+ *
+ * Also prints a couple of lines of useful debugging stuff if DEBUG is
+ * true: the first 15 and last (up to) 15 characters of the buffer as it
+ * currently is (sized by zzbufsize, so this stays correct after the
+ * buffer has already been grown), the current character, and the offset
+ * of zzbegexpr.
+ */
+void lexer_overflow (unsigned char **lastpos, unsigned char **nextpos)
+{
+#if DEBUG
+   char head[16], tail[16];
+   int  tail_len = (zzbufsize < 15) ? zzbufsize : 15;
+
+   printf ("zzcopy: overflow detected\n");
+   printf (" zzbegcol=%d, zzendcol=%d, zzline=%d\n",
+           zzbegcol, zzendcol, zzline);
+   strncpy (head, zzlextext, 15); head[15] = 0;
+   strncpy (tail, zzlextext+zzbufsize-tail_len, tail_len); tail[tail_len] = 0;
+   printf (" zzlextext=>%s...%s< (last char=%d (%c))\n",
+           head, tail,
+           zzlextext[zzbufsize-1], zzlextext[zzbufsize-1]);
+   /* a pointer difference is a ptrdiff_t; convert it to match %d */
+   printf (" zzchar = %d (%c), zzbegexpr=zzlextext+%d\n",
+           zzchar, zzchar, (int) (zzbegexpr-zzlextext));
+#endif
+
+   notify ("lexical buffer overflowed (reallocating to %d bytes)",
+           zzbufsize+ZZLEXBUFSIZE);
+   realloc_lex_buffer (ZZLEXBUFSIZE, lastpos, nextpos);
+
+} /* lexer_overflow () */
+
+
+#if ZZCOPY_FUNCTION
+/*
+ * zzcopy()
+ *
+ * Does the same as the ZZCOPY macro (in lex_auxiliary.h), but as a
+ * function for easier debugging: if the write position `*nextpos' has
+ * reached `*lastpos', the buffer is grown via lexer_overflow() (which
+ * updates both pointers); then the current character zzchar is stored
+ * and the write position advanced.
+ *
+ * `ovf_flag' is accepted for interface compatibility but not used.
+ */
+void zzcopy (char **nextpos, char **lastpos, int *ovf_flag)
+{
+   (void) ovf_flag;
+
+   if (*nextpos >= *lastpos)
+   {
+      /*
+       * lexer_overflow() is declared with unsigned char ** parameters;
+       * convert explicitly instead of relying on an implicit conversion
+       * between incompatible pointer types.
+       */
+      lexer_overflow ((unsigned char **) lastpos,
+                      (unsigned char **) nextpos);
+   }
+
+   **nextpos = zzchar;
+   (*nextpos)++;
+}
+#endif
+
+
+
+/* ----------------------------------------------------------------------
+ * Report/maintain lexical state
+ * report_state() (only meaningful if DEBUG)
+ * initialize_lexer_state()
+ *
+ * Note that the lexical action functions, below, also fiddle with
+ * the lexical state variables an awful lot.
+ */
+
+#if DEBUG
+/* printable names for the EntryState values, in declaration order */
+char *state_names[] =
+{
+   "toplevel", "after_at", "after_type", "in_comment", "in_entry"
+};
+
+/* printable metatype names (not referenced in this file) */
+char *metatype_names[] =
+{
+   "unknown", "comment", "preamble", "string", "alias", "modify", "entry"
+};
+
+/*
+ * report_state ()
+ *
+ * Prints one line of lexer state to stdout, labelled with `where': the
+ * current token text, line, column, lookahead token and EntryState.
+ * Compiles to nothing unless DEBUG is true.
+ */
+static void
+report_state (char *where)
+{
+   char * state = state_names[EntryState];
+
+   printf ("%s: lextext=%s (line %d, offset %d), "
+           "token=%d, EntryState=%s\n",
+           where, zzlextext, zzline, zzbegcol, NLA, state);
+}
+#else
+/* no-op stand-in so that call sites need no #if of their own */
+# define report_state(where)
+#endif
+
+/*
+ * initialize_lexer_state ()
+ *
+ * Puts the lexer back at top level: clears the per-entry bookkeeping
+ * (state, opener, metatype, junk counter) and selects the START lexer
+ * mode.  The string-state variables are not touched here.
+ */
+void initialize_lexer_state (void)
+{
+   zzmode (START);
+
+   JunkCount     = 0;
+   EntryMetatype = BTE_UNKNOWN;
+   EntryOpener   = (char) 0;
+   EntryState    = toplevel;
+}
+
+
+/*
+ * entry_metatype ()
+ *
+ * Read-only accessor for the file-private EntryMetatype, i.e. the
+ * metatype recorded by name() for the entry being scanned (BTE_UNKNOWN
+ * right after initialize_lexer_state()).
+ */
+bt_metatype entry_metatype (void)
+{
+   bt_metatype current = EntryMetatype;
+
+   return current;
+}
+
+
+
+/* ----------------------------------------------------------------------
+ * Lexical actions (START and LEX_ENTRY modes)
+ */
+
+/*
+ * newline ()
+ *
+ * Lexical action for a newline outside of a quoted string: bumps the
+ * line counter, then skips the newline.
+ */
+void newline (void)
+{
+   zzline += 1;
+   zzskip();
+}
+
+
+/*
+ * comment ()
+ *
+ * Lexical action for a comment: counts one line and skips the matched
+ * text, exactly as newline() does.
+ * NOTE(review): presumably the lexer rule consumes through end-of-line,
+ * which is why zzline is bumped here -- confirm against bibtex.g.
+ */
+void comment (void)
+{
+   zzline += 1;
+   zzskip();
+}
+
+
+/*
+ * at_sign ()
+ *
+ * Lexical action for '@'.  At top level it starts an entry: the state
+ * becomes after_at, the lexer switches to LEX_ENTRY mode, and any junk
+ * counted since the previous entry is reported once and the counter
+ * cleared.  Anywhere else only a warning is issued.
+ */
+void at_sign (void)
+{
+   if (EntryState != toplevel)
+   {
+      lexical_warning ("\"@\" in strange place -- should get syntax error");
+   }
+   else
+   {
+      EntryState = after_at;
+      zzmode (LEX_ENTRY);
+
+      if (JunkCount > 0)
+      {
+         lexical_warning ("%d characters of junk seen at toplevel", JunkCount);
+         JunkCount = 0;
+      }
+   }
+
+   report_state ("at_sign");
+}
+
+
+/*
+ * toplevel_junk ()
+ *
+ * Lexical action for text seen between entries: adds the length of the
+ * current token text to JunkCount (reported later by at_sign()) and
+ * skips the text.
+ */
+void toplevel_junk (void)
+{
+   char * junk = zzlextext;
+
+   JunkCount += strlen (junk);
+   zzskip ();
+}
+
+
+/*
+ * name ()
+ *
+ * Lexical action for a name token.  Only a name directly after '@' is
+ * interesting: it is the entry type, and (compared case-insensitively)
+ * selects the metatype --
+ *    "comment"   BTE_COMMENT  (state becomes in_comment)
+ *    "preamble"  BTE_PREAMBLE
+ *    "string"    BTE_MACRODEF
+ *    other       BTE_REGULAR  ("alias" and "modify" get no special
+ *                              treatment)
+ * with the state becoming after_type in all but the first case.  A name
+ * at top level is an internal error; in every other state nothing is
+ * done.
+ */
+void name (void)
+{
+   report_state ("name (pre)");
+
+   if (EntryState == toplevel)
+   {
+      internal_error ("junk at toplevel (\"%s\")", zzlextext);
+   }
+   else if (EntryState == after_at)
+   {
+      char * etype = zzlextext;
+
+      if (strcasecmp (etype, "comment") == 0)
+      {
+         EntryMetatype = BTE_COMMENT;
+         EntryState = in_comment;
+      }
+      else
+      {
+         EntryState = after_type;
+
+         if (strcasecmp (etype, "preamble") == 0)
+            EntryMetatype = BTE_PREAMBLE;
+         else if (strcasecmp (etype, "string") == 0)
+            EntryMetatype = BTE_MACRODEF;
+         else
+            EntryMetatype = BTE_REGULAR;
+      }
+   }
+   /* after_type, in_comment, in_entry: do nothing */
+
+   report_state ("name (post)");
+
+}
+
+
+/*
+ * lbrace ()
+ *
+ * Lexical action for '{' outside of a string.
+ *
+ * The view of "when is a left brace an entry opener" is deliberately
+ * narrow: only right after '@name' (state after_type, so name was not
+ * 'comment').  In that case the entry is opened and the lookahead token
+ * is forced to ENTRY_OPEN.  Inside an entry or a comment the brace starts
+ * a string instead.  In any other state we merely warn.
+ */
+void lbrace (void)
+{
+   switch (EntryState)
+   {
+      case in_entry:
+      case in_comment:
+         start_string ('{');
+         break;
+
+      case after_type:
+         EntryState = in_entry;
+         EntryOpener = '{';
+         NLA = ENTRY_OPEN;
+         break;
+
+      default:
+         lexical_warning ("\"{\" in strange place -- should get a syntax error");
+         break;
+   }
+
+   report_state ("lbrace");
+}
+
+
+/*
+ * rbrace ()
+ *
+ * Lexical action for '}' outside of a string.  Inside an entry it closes
+ * the entry: warns if the entry was opened with '(', forces the lookahead
+ * token to ENTRY_CLOSE and resets the lexer state to top level.  Anywhere
+ * else it only warns.
+ */
+void rbrace (void)
+{
+   if (EntryState != in_entry)
+   {
+      lexical_warning ("\"}\" in strange place -- should get a syntax error");
+   }
+   else
+   {
+      if (EntryOpener == '(')
+         lexical_warning ("entry started with \"(\", but ends with \"}\"");
+
+      NLA = ENTRY_CLOSE;
+      initialize_lexer_state ();
+   }
+
+   report_state ("rbrace");
+}
+
+
+/*
+ * lparen ()
+ *
+ * Lexical action for '(' outside of a string.  In a comment entry it
+ * starts a paren-delimited string; right after the entry type it opens
+ * the entry; anywhere else it only warns.
+ * NOTE(review): unlike lbrace(), NLA is not assigned here -- presumably
+ * the '(' rule already yields ENTRY_OPEN; confirm against bibtex.g.
+ */
+void lparen (void)
+{
+   switch (EntryState)
+   {
+      case in_comment:
+         start_string ('(');
+         break;
+
+      case after_type:
+         EntryState = in_entry;
+         EntryOpener = '(';
+         break;
+
+      default:
+         lexical_warning ("\"(\" in strange place -- should get a syntax error");
+         break;
+   }
+
+   report_state ("lparen");
+}
+
+
+/*
+ * rparen ()
+ *
+ * Lexical action for ')' outside of a string.  Inside an entry it closes
+ * the entry: warns if the entry was opened with '{', then resets the
+ * lexer state to top level.  Anywhere else it only warns.
+ */
+void rparen (void)
+{
+   if (EntryState != in_entry)
+   {
+      lexical_warning ("\")\" in strange place -- should get a syntax error");
+   }
+   else
+   {
+      if (EntryOpener == '{')
+         lexical_warning ("entry started with \"{\", but ends with \")\"");
+
+      initialize_lexer_state ();
+   }
+
+   report_state ("rparen");
+}
+
+
+/* ----------------------------------------------------------------------
+ * Stuff for processing strings.
+ */
+
+
+/*
+ * start_string ()
+ *
+ * Called with the delimiter ('{', '(' or '"') that opens a string.
+ * Resets all string state, remembers the delimiter and the starting
+ * line, and accounts for the delimiter in the nesting counters (via
+ * open_brace() for '{', directly for '(').  Then switches to LEX_STRING
+ * mode and tells the lexer to keep accumulating into the same buffer.
+ *
+ * A '"' inside a comment entry is an error: it is reported, the lexer is
+ * sent back to top level (START mode) and no string is started.
+ */
+void start_string (char start_char)
+{
+   StringOpener = start_char;
+   StringStart = zzline;
+   BraceDepth = ParenDepth = 0;
+   ApparentRunaway = QuoteWarned = 0;
+
+   switch (start_char)
+   {
+      case '{':
+         open_brace ();
+         break;
+
+      case '(':
+         ParenDepth++;
+         break;
+
+      case '"':
+         if (EntryState == in_comment)
+         {
+            lexical_error ("comment entries must be delimited by either braces or parentheses");
+            EntryState = toplevel;
+            zzmode (START);
+            return;
+         }
+         break;
+
+      default:
+         break;
+   }
+
+#ifdef USER_ZZMODE_STACK
+   if (zzauto != LEX_ENTRY || EntryState != in_entry)
+#else
+   if (!(EntryState == in_entry || EntryState == in_comment))
+#endif
+   {
+      lexical_warning ("start of string seen at weird place");
+   }
+
+   zzmore ();
+   zzmode (LEX_STRING);
+}
+
+
+/*
+ * end_string ()
+ *
+ * Called with the delimiter ('}', ')' or '"') that closes the current
+ * string.  Asserts that it pairs with the delimiter that opened the
+ * string, complains about (and clears) any leftover brace depth, resets
+ * the string state and sets the lookahead token to STRING.
+ *
+ * Afterwards the lexer goes back to LEX_ENTRY mode -- except in a comment
+ * entry, where the string *is* the entry: there a paren-delimited string
+ * is rewritten in place to use brace delimiters, and the lexer returns
+ * to top level (START mode).
+ */
+void end_string (char end_char)
+{
+   char match;
+
+#ifndef ALLOW_WARNINGS
+   match = (char) 0;                    /* silence "might be used */
+                                        /* uninitialized" warning */
+#endif
+
+   if (end_char == '}')
+      match = '{';
+   else if (end_char == ')')
+      match = '(';
+   else if (end_char == '"')
+      match = '"';
+   else
+      internal_error ("end_string(): invalid end_char \"%c\"", end_char);
+
+   assert (StringOpener == match);
+
+   /*
+    * Leftover brace depth most likely means mismatched braces somewhere;
+    * report it and clear the counter to limit follow-on confusion.
+    */
+   if (BraceDepth > 0)
+   {
+      lexical_error ("unbalanced braces: too many {'s");
+      BraceDepth = 0;
+   }
+
+   StringOpener = (char) 0;
+   StringStart = -1;
+   NLA = STRING;
+
+   if (EntryState != in_comment)
+   {
+      zzmode (LEX_ENTRY);
+   }
+   else
+   {
+      /*
+       * Normalise "(...)" to "{...}".  (The original author worried that
+       * this might be unsafe with unbalanced braces inside the string,
+       * then concluded that btparse cannot produce such a string.)
+       */
+      if (zzlextext[0] == '(')
+      {
+         int len = strlen (zzlextext);
+
+         zzlextext[0] = '{';
+         zzlextext[len-1] = '}';
+      }
+
+      EntryState = toplevel;
+      zzmode (START);
+   }
+
+   report_state ("string");
+}
+
+
+/*
+ * open_brace ()
+ *
+ * Called for every '{' that belongs to a string -- both the one that
+ * opens it (via start_string()) and any nested ones (straight from the
+ * lexer).  Goes one level deeper and keeps accumulating text.
+ */
+void open_brace (void)
+{
+   BraceDepth += 1;
+   zzmore ();
+   report_state ("open_brace");
+}
+
+
+/*
+ * close_brace ()
+ *
+ * Called for a '}' inside a string.  Comes up one nesting level; if that
+ * brings a brace-delimited string back to depth 0, the string is ended
+ * (end_string() sets the lookahead token to STRING).  Otherwise the
+ * lexer is told to keep accumulating text.
+ *
+ * A negative depth means an unmatched '}' -- for instance in a
+ * quote-delimited string such as "Hello}".  That is reported, the depth
+ * is clamped back to zero, and scanning of the string continues.
+ */
+void close_brace (void)
+{
+   BraceDepth--;
+
+   if (StringOpener == '{' && BraceDepth == 0)
+   {
+      end_string ('}');
+   }
+   else
+   {
+      if (BraceDepth < 0)
+      {
+         lexical_error ("unbalanced braces: too many }'s");
+         BraceDepth = 0;
+      }
+
+      /* unmatched or merely nested: either way, keep eating */
+      zzmore ();
+   }
+
+   report_state ("close_brace");
+}
+
+
+/*
+ * lparen_in_string ()
+ *
+ * Called for a '(' inside a string: goes one parenthesis level deeper
+ * and keeps accumulating text.
+ */
+void lparen_in_string (void)
+{
+   ParenDepth += 1;
+   zzmore ();
+}
+
+
+/*
+ * rparen_in_string ()
+ *
+ * Called for a ')' inside a string: comes up one parenthesis level.  If
+ * that brings a paren-delimited string back to depth 0 the string is
+ * ended; otherwise the lexer keeps accumulating text.
+ */
+void rparen_in_string (void)
+{
+   ParenDepth--;
+
+   if (StringOpener != '(' || ParenDepth != 0)
+   {
+      zzmore ();
+      return;
+   }
+
+   end_string (')');
+}
+
+
+/*
+ * quote_in_string ()
+ *
+ * Called for a '"' inside a string.  If the string was opened with a
+ * quote and we are at brace depth 0, this quote ends it.  Otherwise the
+ * quote is ordinary text and scanning continues -- but, once per string,
+ * a warning is issued when the quote sits at the string's outermost
+ * level (depth 0 for quote- and paren-delimited strings, depth 1 for
+ * brace-delimited ones).
+ *
+ * The warning presumes the text is headed for TeX, so arguably it should
+ * be optional.
+ */
+void quote_in_string (void)
+{
+   boolean at_top = FALSE;
+
+   if (StringOpener == '"' && BraceDepth == 0)
+   {
+      end_string ('"');
+      return;
+   }
+
+   switch (StringOpener)
+   {
+      case '"':
+      case '(':
+         at_top = (BraceDepth == 0);
+         break;
+
+      case '{':
+         at_top = (BraceDepth == 1);
+         break;
+
+      default:
+         internal_error ("Illegal string opener \"%c\"", StringOpener);
+         break;
+   }
+
+   if (at_top && !QuoteWarned)
+   {
+      lexical_warning ("found \" at brace-depth zero in string "
+                       "(TeX accents in BibTeX should be inside braces)");
+      QuoteWarned = 1;
+   }
+
+   zzmore ();
+}
+
+
+/*
+ * check_runaway_string ()
+ *
+ * Called from the lexer whenever we see a newline in a string.  See
+ * bibtex.g for a detailed explanation; basically, this function
+ * looks for an entry start ("@name{") or new field ("name=") immediately
+ * after a newline (with possible whitespace).  This is a heuristic
+ * check for runaway strings, under the assumption that text that looks
+ * like a new entry or new field won't actually occur inside a string
+ * very often.
+ *
+ * Side effects: increments zzline (if the chunk really starts with a
+ * newline), converts every whitespace character in the chunk at
+ * zzbegexpr to a plain space, warns at most once per string (tracked by
+ * ApparentRunaway), and finally tells the lexer to keep accumulating.
+ *
+ * Characters are converted to unsigned char before being handed to
+ * isspace()/tolower(): passing a negative plain char (any non-ASCII byte
+ * where char is signed) to those functions is undefined behaviour.
+ */
+void check_runaway_string (void)
+{
+   int len;
+   int i;
+
+   /*
+    * could these be made significantly more efficient by a 256-element
+    * lookup table instead of calling strchr()?
+    */
+   static const char *alpha_chars = "abcdefghijklmnopqrstuvwxyz";
+   static const char *name_chars = "abcdefghijklmnopqrstuvwxyz0123456789:+/'.-";
+
+   /*
+    * on entry: zzlextext contains the whole string, starting with {
+    * and with newlines/tabs converted to space; zzbegexpr points to
+    * a chunk of the string starting with newline (newlines and
+    * tabs have not yet been converted)
+    */
+
+#if DEBUG > 1
+   printf ("check_runaway_string(): zzline=%d\n", zzline);
+   printf ("zzlextext=>%s<\nzzbegexpr=>%s<\n",
+           zzlextext, zzbegexpr);
+#endif
+
+   /*
+    * increment zzline to take the leading newline into account -- but
+    * first a sanity check to be sure that newline is there!
+    */
+
+   if (zzbegexpr[0] != '\n')
+   {
+      lexical_warning ("huh? something's wrong (buffer overflow?) near "
+                       "offset %d (line %d)", zzendcol, zzline);
+   }
+   else
+   {
+      zzline++;
+   }
+
+   /* standardize whitespace (convert all to space) */
+
+   len = strlen (zzbegexpr);
+   for (i = 0; i < len; i++)
+   {
+      if (isspace ((unsigned char) zzbegexpr[i]))
+         zzbegexpr[i] = ' ';
+   }
+
+   if (!ApparentRunaway)                /* haven't already warned about it */
+   {
+      enum { none, entry, field, giveup } guess;
+
+      /* skip the leading (former) newline and any whitespace after it */
+      i = 1;
+      guess = none;
+      while (i < len && zzbegexpr[i] == ' ') i++;
+
+      if (zzbegexpr[i] == '@')
+      {
+         i++;
+         while (i < len && zzbegexpr[i] == ' ') i++;
+         guess = entry;
+      }
+
+      /*
+       * NB. at end of chunk zzbegexpr[i] is NUL, and strchr() "finds" NUL
+       * (it matches the terminator of alpha_chars); that case takes this
+       * branch too and is caught by the `i == len' test below.
+       */
+      if (strchr (alpha_chars,
+                  tolower ((unsigned char) zzbegexpr[i])) != NULL)
+      {
+         while (i < len
+                && strchr (name_chars,
+                           tolower ((unsigned char) zzbegexpr[i])) != NULL)
+            i++;
+         while (i < len && zzbegexpr[i] == ' ') i++;
+
+         if (i == len)
+         {
+            guess = giveup;
+         }
+         else
+         {
+            if (guess == entry)
+            {
+               if (zzbegexpr[i] != '{' && zzbegexpr[i] != '(')
+                  guess = giveup;
+            }
+            else                        /* assume it's a field */
+            {
+               if (zzbegexpr[i] == '=')
+                  guess = field;
+               else
+                  guess = giveup;
+            }
+         }
+      }
+      else                              /* no name seen after WS or @ */
+      {
+         guess = giveup;
+      }
+
+      if (guess == none)
+         internal_error ("gee, I should have made a guess by now");
+
+      if (guess != giveup)
+      {
+         lexical_warning ("possible runaway string started at line %d",
+                          StringStart);
+         ApparentRunaway = 1;
+      }
+   }
+
+   zzmore();
+}
+