tokenize.c

/****************************************************************************
*
*                            Open Watcom Project
*
*    Portions Copyright (c) 1983-2002 Sybase, Inc. All Rights Reserved.
*
*  ========================================================================
*
*    This file contains Original Code and/or Modifications of Original
*    Code as defined in and that are subject to the Sybase Open Watcom
*    Public License version 1.0 (the 'License'). You may not use this file
*    except in compliance with the License. BY USING THIS FILE YOU AGREE TO
*    ALL TERMS AND CONDITIONS OF THE LICENSE. A copy of the License is
*    provided with the Original Code and Modifications, and is also
*    available at www.sybase.com/developer/opensource.
*
*    The Original Code and all software distributed under the License are
*    distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
*    EXPRESS OR IMPLIED, AND SYBASE AND ALL CONTRIBUTORS HEREBY DISCLAIM
*    ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF
*    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR
*    NON-INFRINGEMENT. Please see the License for the specific language
*    governing rights and limitations under the License.
*
*  ========================================================================
*
* Description:  tokenizer.
*
****************************************************************************/

#include <ctype.h>

#include "globals.h"
#include "memalloc.h"
#include "parser.h"
#include "condasm.h"
#include "reswords.h"
#include "input.h"
#include "segment.h"
#include "listing.h"
#include "tokenize.h"
#include "fastpass.h"
#include "myassert.h"

/* CONCATID: if 1, get_id() checks for a backslash right behind an
 * identifier and may continue the name on the concatenated line.
 */
#define CONCATID 0 /* 0=most compatible (see backsl.asm) */
/* MASMNUMBER: if 1, get_number() accepts a number even if id characters
 * follow it directly; if 0, such a number becomes T_BAD_NUM.
 */
#define MASMNUMBER 1 /* 1=Masm-compatible number scanning */
#ifdef __I86__
#define TOKSTRALIGN 0 /* 0=don't align token strings */
#else
#define TOKSTRALIGN 1 /* 1=align token strings to sizeof(uint_32) */
#endif

#ifndef DOTNAMEX /* v2.08: added */
/* set DOTNAMEX to 1 if support for Intel C++ generated assembly code
 * is to be enabled. It makes get_id() and GetToken() accept dots
 * within names.
 */
#define DOTNAMEX 0
#endif

extern struct ReservedWord  ResWordTable[];

#ifdef DEBUG_OUT
int_32 cnttok0; /* debug counter: Tokenize() calls with start == 0 */
int_32 cnttok1; /* debug counter: Tokenize() calls with start != 0 */
extern struct asm_tok *end_tokenarray;
extern char           *end_stringbuf;
#endif
extern char    *token_stringbuf;  /* start token string buffer */
extern char    *commentbuffer;    /* Tokenize() copies a trailing ';' comment here */

/* v2.08: moved to struct line_status */
//static uint_8 g_flags; /* directive flags for current line */

/* ASCII-only replacement for tolower(), used when neither __GNUC__ nor
 * __POCC__ is defined. Only 'A'..'Z' are mapped; every other value is
 * returned unchanged.
 * The argument is parenthesized at each use so that any expression may be
 * passed. It is still evaluated more than once, so it must not have side
 * effects.
 */
#if !defined(__GNUC__) && !defined(__POCC__)
#define tolower(c) ( ( (c) >= 'A' && (c) <= 'Z' ) ? ( (c) | 0x20 ) : (c) )
#endif

/* Constant one-character strings for the single-character tokens.
 * The tables are indexed by (character - first character) and the entries
 * are used as token strings via "(char *)&stokstrX[index]".
 * Each entry is a real 2-byte C string (character + NUL), so the result
 * does not depend on the byte order or the size of an integer type.
 */
/* strings for token 0x28 - 0x2F */
static const char stokstr1[][2] = {
    "(", ")", "*", "+", ",", "-", ".", "/" };
/* strings for token 0x5B - 0x5D (0x5C, the backslash, has no string) */
static const char stokstr2[][2] = {
    "[", "", "]" };

/* test line concatenation if last token is a comma.
 * dont concat EQU, macro invocations or
 * - ECHO
 * - FORC/IRPC (v2.0)
 * - INCLUDE (v2.8)
 * lines!
 * v2.05: don't concat if line's an instruction.
 */
static bool IsMultiLine( struct asm_tok tokenarray[] )
/****************************************************/
{
    struct asym *sym;
    int i;

    if ( tokenarray[1].token == T_DIRECTIVE && tokenarray[1].tokval == T_EQU )
        return( FALSE );
    i = ( tokenarray[1].token == T_COLON ? 2 : 0 );
    /* don't concat macros */
    if ( tokenarray[i].token == T_ID ) {
        sym = SymSearch( tokenarray[i].string_ptr );
        if ( sym && ( sym->state == SYM_MACRO )
#if VARARGML
            && sym->mac_multiline == FALSE  /* v2.11: added */
#endif
           )
            return( FALSE );
    } else if ( tokenarray[i].token == T_INSTRUCTION ||
               ( tokenarray[i].token == T_DIRECTIVE &&
               ( tokenarray[i].tokval == T_ECHO ||
                tokenarray[i].tokval == T_INCLUDE ||
                tokenarray[i].tokval == T_FORC ||
                tokenarray[i].tokval == T_IRPC ) ) ) {
        return( FALSE );
    }
    return( TRUE );
}

/* Scan a floating-point literal starting at p->input.
 * Accepted characters: digits, one '.', and one 'e'/'E' which may be
 * followed directly by '+' or '-'. Scanning stops at the first character
 * that doesn't fit. The hex format terminated by 'r' (3F800000r) is not
 * handled here.
 * The scanned text is copied to p->output and NUL-terminated; p->input and
 * p->output are advanced. buf->token is set to T_FLOAT, buf->floattype
 * to NULLC. Always returns NOT_ERROR.
 */
static ret_code get_float( struct asm_tok *buf, struct line_status *p )
/*********************************************************************/
{
    /* valid floats look like:  (int)[.(int)][e(int)]
     * Masm also allows hex format, terminated by 'r' (3F800000r)
     */

    char    got_decimal = FALSE;
    char    got_e = FALSE;
    char    *ptr = p->input;
    unsigned len;

    for( ; *ptr != NULLC; ptr++ ) {
        /* the <ctype.h> functions are only defined for values that are
         * representable as unsigned char (or EOF).
         */
        unsigned char c = (unsigned char)*ptr;
        if( isdigit( c ) ) {
            ;
        } else if ( c == '.' && got_decimal == FALSE ) {
            got_decimal = TRUE;
        } else if ( tolower( c ) == 'e' && got_e == FALSE ) {
            got_e = TRUE;
            /* accept e+2 / e-4 /etc. */
            if ( *(ptr+1) == '+' || *(ptr+1) == '-' )
                ptr++;
            /* it's accepted if there's no digit behind 'e' */
        } else
            break;
    }

    buf->token = T_FLOAT;
    buf->floattype = NULLC;
    len = ptr - p->input;
    memcpy( p->output, p->input, len );
    p->output += len;
    *p->output++ = NULLC;
    p->input = ptr;

    /* the binary value isn't used currently */

    return( NOT_ERROR );
}

/* Handle a possible line continuation.
 * src: points to the continuation character in the current line.
 * cnt: number of characters scanned so far for the current token; if 0,
 *      the character at src is replaced by a space and the new text is
 *      appended behind it.
 * out: buffer into which the next line is read by GetTextLine().
 * ls:  line status; ls->start is used to check the total line length.
 *
 * The continuation is only done if nothing but white space, or a
 * comment (';'), follows src. Leading white space of the new line is
 * skipped. If the resulting line would exceed MAX_LINE_LEN, LINE_TOO_LONG
 * is emitted and the new text is truncated.
 * Returns NOT_ERROR if a line was appended, else EMPTY.
 */
static ret_code ConcatLine( char *src, int cnt, char *out, struct line_status *ls )
/*********************************************************************************/
{
    char *p = src+1;
    int max;

    /* the cast avoids undefined behavior of isspace() for negative chars */
    while ( isspace( (unsigned char)*p ) ) p++;
    if ( *p != NULLC && *p != ';' )
        return( EMPTY );

    if( !GetTextLine( out ) )
        return( EMPTY );

    p = out;
    /* skip leading spaces */
    while ( isspace( (unsigned char)*p ) ) p++;
    max = strlen( p );
    if ( cnt == 0 )
        *src++ = ' ';
    if ( ( src - ls->start ) + max >= MAX_LINE_LEN ) {
        EmitError( LINE_TOO_LONG );
        max = MAX_LINE_LEN - ( src - ls->start + 1 );
        *(p+max) = NULLC;
    }
    /* max+1: copy the terminating NUL as well */
    memcpy( src, p, max+1 );
    return( NOT_ERROR );
}

/* Scan a string token at p->input and copy it to p->output.
 * Three kinds of strings are handled, selected by the first character:
 * - quoted strings ("..." or '...'): the quotes are stored, a pair of
 *   quotes inside the string is stored as a single quote.
 * - literals (<...>, and {...} unless TOK_NOCURLBRACES is set): the outer
 *   delimiters are not stored, nested delimiters are.
 * - undelimited strings: everything up to a white space, ',', ')' or '%'
 *   (and ';' if p->flags is TOK_DEFAULT).
 * A literal without closing delimiter is rescanned as undelimited string.
 *
 * On success buf->token is T_STRING, buf->stringlen is the character
 * count and buf->string_delim is the opening delimiter or NULLC;
 * p->input and p->output are advanced.
 * Returns NOT_ERROR, ERROR (string too long, or line too long after a
 * comma concatenation) or EMPTY (an undelimited string that starts with
 * a line continuation).
 */
static ret_code get_string( struct asm_tok *buf, struct line_status *p )
/**********************************************************************/
{
    char    symbol_o;
    char    symbol_c;
    char    c;
    char    *src = p->input;
    char    *dst = p->output;
    int     count = 0;
    int     level;

    symbol_o = *src;

    switch( symbol_o ) {
    case '"':
    case '\'':
        buf->string_delim = symbol_o;
        *dst++ = symbol_o;
        src++;
        for ( ; count < MAX_STRING_LEN; src++, count++ ) {
            c = *src;
            if( c == symbol_o ) { /* another quote? */
                *dst++ = c; /* store it */
                src++;
                if( *src != c )
                    break; /* exit loop */
                /* a pair of quotes inside the string is
                 * handled as a single quote */
            } else if( c == NULLC ) {
                /* missing terminating quote, change to undelimited string */
                buf->string_delim = NULLC;
                count++; /* count the first quote */
                break;
            } else {
                *dst++ = c;
            }
        }
        break;  /* end of string marker is the same */
    case '{':
        if ( p->flags & TOK_NOCURLBRACES )
            goto undelimited_string;
        /* fall through */
    case '<':
        buf->string_delim = symbol_o;
        symbol_c = ( symbol_o == '<' ? '>' : '}' );
        src++;
        /* level counts nested opening delimiters */
        for( level = 0; count < MAX_STRING_LEN; ) {
            c = *src;
            if( c == symbol_o ) { /* < or { ? */
                level++;
                *dst++ = c; src++;
                count++;
            } else if( c == symbol_c ) { /* > or }? */
                if( level ) {
                    level--;
                    *dst++ = c; src++;
                    count++;
                } else {
                    /* store the string delimiter unless it is <> */
                    /* v2.08: don't store delimiters for {}-literals */
                    //if (symbol_o != '<')
                    //    *dst++ = c;
                    src++;
                    break; /* exit loop */
                }
#if 1
            /*
             a " or ' inside a <>/{} string? Since it's not a must that
             [double-]quotes are paired in a literal it must be done
             directive-dependant!
             see: IFIDN <">,<">
             */
            } else if( ( c == '"' || c == '\'' ) && ( p->flags2 & DF_STRPARM ) == 0 ) {
                char delim = c;
                char *tdst;
                char *tsrc;
                int tcount;
                *dst++ = c; src++;
                count++;
                /* remember the state behind the opening quote */
                tdst = dst;
                tsrc = src;
                tcount = count;
                while (*src != delim && *src != NULLC && count < MAX_STRING_LEN-1 ) {
                    if ( symbol_o == '<' && *src == '!' && *(src+1) != NULLC )
                        src++;
                    *dst++ = *src++;
                    count++;
                }
                if ( *src == delim ) {
                    *dst++ = *src++;
                    count++;
                    continue;
                } else {
                    /* no matching quote found: restore values, the
                     * opening quote remains stored as a normal character
                     */
                    src = tsrc;
                    dst = tdst;
                    count = tcount;
                }
#endif
            } else if( c == '!' && symbol_o == '<' && *(src+1) ) {
                /* handle literal-character operator '!'.
                 * it makes the next char to enter the literal uninterpreted.
                 */
                /* v2.09: don't store the '!' */
                //*dst++ = c; src++;
                //count++;
                //if ( count == MAX_STRING_LEN )
                //    break;
                src++;
                *dst++ = *src++;
                count++;
            } else if( c == '\\' &&  ConcatLine( src, count, dst, p ) != EMPTY ) {
                p->flags3 |= TF3_ISCONCAT;
            } else if( c == NULLC || ( c == ';' && symbol_o == '{' )) {
                if ( p->flags == TOK_DEFAULT && (( p->flags2 & DF_NOCONCAT ) == 0 ) ) { /* <{ */
                    /* if last nonspace character was a comma
                     * get next line and continue string scan
                     */
                    char *tmp = dst-1;
                    while ( isspace(*tmp) ) tmp--;
                    if ( *tmp == ',' ) {
                        DebugMsg1(("Tokenize.get_string: comma concatenation: %s\n", src ));
                        tmp = GetAlignedPointer( p->output, strlen( p->output ) );
                        if( GetTextLine( tmp ) ) {
                            /* skip leading spaces */
                            while ( isspace( *tmp ) ) tmp++;
                            /* this size check isn't fool-proved yet */
                            if ( strlen( tmp ) + count >= MAX_LINE_LEN ) {
                                EmitError( LINE_TOO_LONG );
                                return( ERROR );
                            }
                            strcpy( src, tmp );
                            continue;
                        }
                    }
                }
                /* closing delimiter is missing: start again at the
                 * opening delimiter and scan an undelimited string
                 */
                src = p->input;
                dst = p->output;
                *dst++ = *src++;
                count = 1;
                goto undelimited_string;
            } else {
                *dst++ = c; src++;
                count++;
            }
        }
        break;
    default:
        undelimited_string:
        buf->string_delim = NULLC;
        /* this is an undelimited string,
         * so just copy it until we hit something that looks like the end.
         * this format is used by the INCLUDE directive, but may also
         * occur inside the string macros!
         */
        /* v2.05: also stop if a ')' is found - see literal2.asm regression test */
        //for( count = 0 ; count < MAX_STRING_LEN && *src != NULLC && !isspace( *src ) && *src != ',' && *src != ';'; ) {
        for( ; count < MAX_STRING_LEN &&
            /* v2.08: stop also at < and % */
            //*src != NULLC && !isspace( *src ) && *src != ',' && *src != ';' && *src != ')'; ) {
            //*src && !isspace( *src ) && *src != ',' && *src != ')' && *src != '<' && *src != '%'; ) {
            *src && !isspace( *src ) && *src != ',' && *src != ')' && *src != '%'; ) {
            if ( *src == ';' && p->flags == TOK_DEFAULT )
                break;
            /* v2.11: handle '\' also for expanded lines */
            //if (  *src == '\\' && !( p->flags & TOK_NOCURLBRACES ) ) {
            if (  *src == '\\' && ( p->flags == TOK_DEFAULT || ( p->flags & TOK_LINE ) ) ) {
                if ( ConcatLine( src, count, dst, p ) != EMPTY ) {
                    DebugMsg1(("Tokenize.get_string: backslash concatenation: >%s<\n", src ));
                    p->flags3 |= TF3_ISCONCAT;
                    if ( count )
                        continue;
                    /* nothing scanned yet: no token is returned */
                    return( EMPTY );
                }
            }
            /* v2.08: handle '!' operator */
            /* here the '!' is stored along with the following character,
             * but the pair is counted as one
             */
            if ( *src == '!' && *(src+1) && count < MAX_STRING_LEN - 1 )
                *dst++ = *src++;
            *dst++ = *src++;
            count++;
        }
        break;
    }

    /* all scan loops above stop when count reaches MAX_STRING_LEN */
    if ( count == MAX_STRING_LEN ) {
        EmitError( STRING_OR_TEXT_LITERAL_TOO_LONG );
        return( ERROR );
    }
    *dst++ = NULLC;
    buf->token = T_STRING;
    buf->stringlen = count;
    p->input = src;
    p->output = dst;
    return( NOT_ERROR );
}

/* Scan a token that starts with a special character at p->input.
 * - ':' and '::', '%', '(' ')' '*' '+' ',' '-' '.' '/', '[' ']' and '&'
 *   become tokens of their own; their string_ptr is set to a constant
 *   string, nothing is written to p->output.
 * - '=' (not followed by a second '=') becomes directive T_EQU with
 *   dirtype DRT_EQUALSGN.
 * - if DF_CEXPR is set in p->flags2, the C-style operators are stored as
 *   T_STRING tokens of length 1 or 2 in p->output.
 * - if PERCENT_OUT is enabled, %OUT becomes directive T_ECHO.
 * - anything else is passed to get_string().
 * Returns NOT_ERROR, the result of get_string(), or EMPTY if a '%' is
 * the first token of a line scanned with TOK_DEFAULT (TF3_EXPANSION is
 * set in p->flags3 then).
 */
static ret_code get_special_symbol( struct asm_tok *buf, struct line_status *p )
/******************************************************************************/
{
    char    symbol;
    //int     i;

    symbol = *p->input;
    switch( symbol ) {
    case ':' : /* T_COLON binary operator (0x3A) */
        p->input++;
        if ( *p->input == ':' ) {
            p->input++;
            buf->token = T_DBL_COLON;
            buf->string_ptr = "::";
        } else {
            buf->token = T_COLON;
            buf->string_ptr = ":";
        }
        break;
    case '%' : /* T_PERCENT (0x25) */
#if PERCENT_OUT
        /* %OUT directive? */
        if ( ( _memicmp( p->input+1, "OUT", 3 ) == 0 ) && !is_valid_id_char( *(p->input+4) ) ) {
            buf->token = T_DIRECTIVE;
            buf->tokval = T_ECHO;
            buf->dirtype = DRT_ECHO;
            /* store the 4 characters of "%OUT" as token string */
            memcpy( p->output, p->input, 4 );
            p->input += 4;
            p->output += 4;
            *(p->output)++ = NULLC;
            break;
        }
#endif
        p->input++;
        /* a '%' as first token of a line: no token is created */
        if ( p->flags == TOK_DEFAULT && p->index == 0 ) {
            p->flags3 |= TF3_EXPANSION;
            return( EMPTY );
        }
        buf->token = T_PERCENT;
        buf->string_ptr = "%";
        break;
    case '(' : /* 0x28: T_OP_BRACKET operator - needs a matching ')' */
        /* v2.11: reset c-expression flag if a macro function call is detected */
        if ( ( p->flags2 & DF_CEXPR ) && p->index && (buf-1)->token == T_ID ) {
            struct asym *sym = SymSearch( (buf-1)->string_ptr );
            if ( sym && ( sym->state == SYM_MACRO ) && sym->isfunc )
                p->flags2 &= ~DF_CEXPR;
        }
        /* no break */
    case ')' : /* 0x29: T_CL_BRACKET */
    case '*' : /* 0x2A: binary operator */
    case '+' : /* 0x2B: unary|binary operator */
    case ',' : /* 0x2C: T_COMMA */
    case '-' : /* 0x2D: unary|binary operator */
    case '.' : /* 0x2E: T_DOT binary operator */
    case '/' : /* 0x2F: binary operator */
        /* all of these are themselves a token */
        p->input++;
        /* the token value is the character code itself */
        buf->token = symbol;
        buf->specval = 0; /* initialize, in case the token needs extra data */
        /* v2.06: use constants for the token string */
        buf->string_ptr = (char *)&stokstr1[symbol - '('];
        break;
    case '[' : /* T_OP_SQ_BRACKET operator - needs a matching ']' (0x5B) */
    case ']' : /* T_CL_SQ_BRACKET (0x5D) */
        p->input++;
        buf->token = symbol;
        /* v2.06: use constants for the token string */
        buf->string_ptr = (char *)&stokstr2[symbol - '['];
        break;
    case '=' : /* (0x3D) */
        if ( *(p->input+1) != '=' ) {
            buf->token = T_DIRECTIVE;
            buf->tokval = T_EQU;
            buf->dirtype = DRT_EQUALSGN; /* to make it differ from EQU directive */
            buf->string_ptr = "=";
            p->input++;
            break;
        }
        /* fall through */
    default:
        /* detect C style operators.
         * DF_CEXPR is set if .IF, .WHILE, .ELSEIF or .UNTIL
         * has been detected in the current line.
         * will catch: '!', '<', '>', '&', '==', '!=', '<=', '>=', '&&', '||'
         * A single '|' will also be caught, although it isn't a valid
         * operator - it will cause a 'operator expected' error msg later.
         * the tokens are stored as one- or two-byte sized "strings".
         */
        /* NOTE(review): strchr() also matches the terminating NUL of its
         * first argument, so symbol == NULLC would be accepted here.
         * Presumably this function is never called at the end of the
         * line - confirm against the caller.
         */
        if ( ( p->flags2 & DF_CEXPR ) && strchr( "=!<>&|", symbol ) ) {
            *(p->output)++ = symbol;
            p->input++;
            buf->stringlen = 1;
            if ( symbol == '&' || symbol == '|' ) {
                /* '&&' or '||' */
                if ( *p->input == symbol ) {
                    *(p->output)++ = symbol;
                    p->input++;
                    buf->stringlen = 2;
                }
            } else if ( *p->input == '=' ) {
                /* '==', '!=', '<=' or '>=' */
                *(p->output)++ = '=';
                p->input++;
                buf->stringlen = 2;
            }
            buf->token = T_STRING;
            buf->string_delim = NULLC;
            *(p->output)++ = NULLC;
            break;
        }
        /* v2.08: ampersand is a special token */
        if ( symbol == '&' ) {
            p->input++;
            buf->token = '&';
            buf->string_ptr = "&";
            break;
        }
        /* anything we don't recognise we will consider a string,
         * delimited by space characters, commas, newlines or nulls
         */
        return( get_string( buf, p ) );
    }
    return( NOT_ERROR );
}

#if 0
/* Disabled code (excluded by "#if 0").
 * Computes buf = buf * base + num for a number stored as an array of
 * <size> bytes, least significant byte first. In each step the low 8 bits
 * are stored and the rest is carried over to the next byte.
 */
static void array_mul_add( unsigned char *buf, unsigned base, unsigned num, unsigned size )
/*****************************************************************************************/
{
    while( size-- > 0 ) {
        num += *buf * base;
        *(buf++) = num;
        num >>= 8;
    }
}
#endif

/* read in a number.
 * check the number suffix:
 * b or y: base 2
 * d or t: base 10
 * h: base 16
 * o or q: base 8
 * r: "real number designator", the token becomes T_FLOAT
 * if CHEXPREFIX is enabled, the prefix 0x selects base 16.
 * without suffix, the current radix (ModuleInfo.radix) is used.
 * if the number is followed by a dot, it is scanned by get_float().
 *
 * on return buf->token is T_NUM (numbase and itemlen are set then),
 * T_BAD_NUM or T_FLOAT. the scanned text is copied to p->output;
 * p->input and p->output are advanced.
 * returns NOT_ERROR or the result of get_float().
 */
static ret_code get_number( struct asm_tok *buf, struct line_status *p )
/**********************************************************************/
{
    char                *ptr = p->input;
    char                *dig_start;
    char                *dig_end;  /* set only on paths that set base != 0 */
    unsigned            base = 0;  /* 0 = no valid base found */
    unsigned            len;
    uint_32             digits_seen; /* bit n is set if digit value n occurred */
    char                last_char;

#define VALID_BINARY    0x0003
#define VALID_OCTAL     0x00ff
#define VALID_DECIMAL   0x03ff
#define OK_NUM( t )     ((digits_seen & ~VALID_##t) == 0)

    digits_seen = 0;
#if CHEXPREFIX
    if( *ptr == '0' && (tolower( *(ptr+1) ) == 'x' ) ) {
        ptr += 2;
        base = 16;
    }
#endif
    dig_start = ptr;
    /* scan all hex digits. the loop is left with last_char set to the
     * (lowercased) first character that isn't a hex digit.
     */
    for( ;; ptr++ ) {
        if (*ptr >= '0' && *ptr <= '9')
            digits_seen |= 1 << (*ptr - '0');
        else {
            last_char = tolower( *ptr );
            if ( last_char >= 'a' && last_char <= 'f' )
                digits_seen |= 1 << ( last_char + 10 - 'a' );
            else
                break;
        }
    }

    /* note that a float MUST contain a dot.
     * 1234e78 is NOT a valid float
     */
    if ( last_char == '.' )
        return( get_float( buf, p ) );

#if 0
    /* v2.08: if suffix isn't followed by a non-id char, don't use it! */
    if ( last_char && is_valid_id_char( *(ptr+1) ) ) {
        last_char = NULLC;
        while ( *(ptr-1) > '9' )
            ptr--;
        digits_seen &= 0x3FF;

    }
#endif

#if CHEXPREFIX
    if ( base != 0 ) {
        dig_end = ptr;
        /* "0x" without any digit is not a number */
        if ( digits_seen == 0 )
            base = 0;
    } else
#endif
    switch( last_char ) {
    case 'r': /* a float with the "real number designator" */
        buf->token = T_FLOAT;
        buf->floattype = 'r';
        ptr++;
        goto number_done;
    case 'h':
        base = 16;
        dig_end = ptr;
        ptr++;
        break;
    //case 'b':
    case 'y':
        if( OK_NUM( BINARY ) ) {
            base = 2;
            dig_end = ptr;
            ptr++;
        }
        break;
    //case 'd':
    case 't':
        if( OK_NUM( DECIMAL ) ) {
            base = 10;
            dig_end = ptr;
            ptr++;
        }
        break;
    case 'q':
    case 'o':
        if( OK_NUM( OCTAL ) ) {
            base = 8;
            dig_end = ptr;
            ptr++;
        }
        break;
    default:
        /* the suffixes 'b' and 'd' are hex digits as well, so they have
         * been consumed by the scan loop above. they are regarded as
         * suffix only if the digits seen (including the b/d itself) don't
         * all fit into the current radix and all characters in front of
         * the suffix are valid digits for base 2 or 10.
         */
        last_char = tolower( *(ptr-1) );
        if ( ( last_char == 'b' || last_char == 'd' ) && digits_seen >= ( 1UL << ModuleInfo.radix ) ) {
            char *tmp = dig_start;
            char max = ( last_char == 'b' ? '1' : '9' );
            for ( dig_end = ptr-1; tmp < dig_end && *tmp <= max; tmp++ );
            if ( tmp == dig_end ) {
                base = ( last_char == 'b' ? 2 : 10 );
                break;
            }
        }
        dig_end = ptr;
#if COCTALS
        if( Options.allow_c_octals && *dig_start == '0' ) {
            if( OK_NUM( OCTAL ) ) {
                base = 8;
                break;
            }
        }
#endif
        /* radix      max. digits_seen
         -----------------------------------------------------------
         2            3      2^2-1  (0,1)
         8            255    2^8-1  (0,1,2,3,4,5,6,7)
         10           1023   2^10-1 (0,1,2,3,4,5,6,7,8,9)
         16           65535  2^16-1 (0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f)
         */
        if ( digits_seen < (1UL << ModuleInfo.radix) )
            base = ModuleInfo.radix;
        break;
    }

#if MASMNUMBER
    /* Masm doesn't swallow alphanum chars which may follow the
     * number!
     */
    if ( base != 0 ) {
#else
    if ( base != 0 && is_valid_id_char( *ptr ) == FALSE ) {
#endif
        buf->token = T_NUM;
        buf->numbase = base;
        /* number of digits, without prefix and suffix */
        buf->itemlen = dig_end - dig_start;
        //DebugMsg(("get_number: inp=%s, value=%" I32_SPEC "X\n", p->input, buf->value64 ));
    } else {
        buf->token = T_BAD_NUM;
        DebugMsg(("get_number: BAD_NUMBER (%s), radix=%u, base=%u, ptr=>%s<, digits_seen=%Xh\n", dig_start, ModuleInfo.radix, base, ptr, digits_seen ));
        /* swallow remainder of token */
        while( is_valid_id_char( *ptr ) ) ++ptr;
    }
number_done:
    /* the token string is the complete text, including prefix and suffix */
    len = ptr - p->input;
    memcpy( p->output, p->input, len );

    p->output += len;
    *p->output++ = NULLC;
    p->input = ptr;

    return( NOT_ERROR );
}

#if BACKQUOTES
/* Scan an identifier that is enclosed in backquotes (`name`).
 * The name is copied without the backquotes to p->output and
 * NUL-terminated; buf->token is set to T_ID, buf->idarg to 0.
 * p->input and p->output are advanced on success only.
 * Returns NOT_ERROR, or ERROR if the end of the line or a ';' is found
 * before the terminating backquote (BACKQUOTE_MISSING is emitted with
 * the part of the name that has been scanned so far).
 */
static ret_code get_id_in_backquotes( struct asm_tok *buf, struct line_status *p )
/********************************************************************************/
{
    char *optr = p->output;
    buf->token = T_ID;
    buf->idarg = 0;

    p->input++;         /* strip off the backquotes */
    for( ; *p->input != '`'; ) {
        if( *p->input == NULLC || *p->input == ';' ) {
            /* terminate the name at the current write position (optr);
             * p->output still points to the start of the name, so it
             * must not be used for the terminator.
             */
            *optr = NULLC;
            EmitErr( BACKQUOTE_MISSING, p->output );
            return( ERROR );
        }
        *optr++ = *p->input++;
    }
    p->input++;         /* skip the terminating '`' */
    *optr++ = NULLC;
    p->output = optr;
    return( NOT_ERROR );
}
#endif

/* get an ID. will always return NOT_ERROR.
 * the name at p->input is copied to p->output and looked up with
 * FindResWord(). the token type becomes:
 * - T_QUESTION_MARK if the name is a single '?'
 * - T_ID if the name isn't a reserved word
 * - T_DOT if the name isn't a reserved word, starts with a dot and
 *   OPTION DOTNAME is off (only the dot is consumed then)
 * - T_INSTRUCTION if the reserved word index is >= SPECIAL_LAST
 *   (or T_ID if -Zm is set and the instruction isn't available for
 *   the current cpu)
 * - else T_REG, T_DIRECTIVE, T_UNARY_OPERATOR, T_BINARY_OPERATOR,
 *   T_STYPE or T_RES_ID, depending on the type in SpecialTable.
 */

static ret_code get_id( struct asm_tok *buf, struct line_status *p )
/******************************************************************/
{
    //struct ReservedWord *resw;
    char *src = p->input;
    char *dst = p->output;
    int  index;
    unsigned size;

#if CONCATID || DOTNAMEX
continue_scan:
#endif
    /* the first character has been checked by the caller already */
    do {
        *dst++ = *src++;
    } while ( is_valid_id_char( *src ) );
#if CONCATID
    /* v2.05: in case there's a backslash right behind
     * the ID, check if a line concatenation is to occur.
     * If yes, and the first char of the concatenated line
     * is also a valid ID char, continue to scan the name.
     * Problem: it's ok for EQU, but less good for other directives.
     */
    if ( *src == '\\' ) {
        if ( ConcatLine( src, src - p->input, dst, p ) != EMPTY ) {
            p->concat = TRUE;
            if ( is_valid_id_char( *src ) )
                goto continue_scan;
        }
    }
#endif
#if DOTNAMEX
    /* if the name starts with a dot or underscore, then accept dots
     * within the name (though not as last char). OPTION DOTNAME
     * must be on.
     */
    if ( *src == '.' && ModuleInfo.dotname &&
        ( *(p->output) == '.' || *(p->output) == '_' ) &&
        ( is_valid_id_char(*(src+1)) || *(src+1) == '.' ) )
        goto continue_scan;
#endif
    /* v2.04: check added */
    /* NOTE(review): the name is truncated to MAX_ID_LEN below, but size
     * keeps the untruncated length and is passed to FindResWord() -
     * confirm that this is harmless.
     */
    size = dst - p->output;
    if ( size > MAX_ID_LEN ) {
        EmitErr( IDENTIFIER_TOO_LONG );
        dst = p->output + MAX_ID_LEN;
    }
    *dst++ = NULLC;

    /* now decide what to do with it */

    if( size == 1 && *p->output == '?' ) {
        /* p->output isn't advanced, a constant string is used instead */
        p->input = src;
        buf->token = T_QUESTION_MARK;
        buf->string_ptr = "?";
        return( NOT_ERROR );
    }
    index = FindResWord( p->output, size );
    if( index == 0 ) {
        /* if ID begins with a DOT, check for OPTION DOTNAME.
         * if not set, skip the token and return a T_DOT instead!
         */
        if ( *p->output == '.' && ModuleInfo.dotname == FALSE ) {
           buf->token = T_DOT;
           buf->string_ptr = (char *)&stokstr1['.' - '('];
           /* consume the dot only; p->output isn't advanced, so the
            * name that was copied above is dropped.
            */
           p->input++;
           return( NOT_ERROR );
        }
        p->input = src;
        p->output = dst;
        buf->token = T_ID;
        buf->idarg = 0;
        return( NOT_ERROR );
    }
    p->input = src;
    p->output = dst;
    buf->tokval = index; /* is a enum instr_token value */
    /* v2.11: RWF_SPECIAL now obsolete */
    //if ( ! ( ResWordTable[index].flags & RWF_SPECIAL ) ) {
    if ( index >= SPECIAL_LAST ) {

        //  DebugMsg(("found item >%s< in instruction table, rm=%X\n", buf->string_ptr, InstrTable[index].rm_byte));

        /* if -Zm is set, the following from the Masm docs is relevant:
         *
         * Reserved Keywords Dependent on CPU Mode with OPTION M510
         *
         * With OPTION M510, keywords and instructions not available in the
         * current CPU mode (such as ENTER under .8086) are not treated as
         * keywords. This also means the USE32, FLAT, FAR32, and NEAR32 segment
         * types and the 80386/486 registers are not keywords with a processor
         * selection less than .386.
         * If you remove OPTION M510, any reserved word used as an identifier
         * generates a syntax error. You can either rename the identifiers or
         * use OPTION NOKEYWORD. For more information on OPTION NOKEYWORD, see
         * OPTION NOKEYWORD, later in this appendix.
         *
         * The current implementation of this rule below is likely to be improved.
         */
        if ( ModuleInfo.m510 ) {
            /* checking the cpu won't give the expected results currently since
             * some instructions in the table (i.e. MOV) start with a 386 variant!
             */
            index = IndexFromToken( buf->tokval );
#if 0 /* changed for v1.96 */
            if (( InstrTable[index].cpu & P_EXT_MASK ) > ( ModuleInfo.curr_cpu & P_EXT_MASK )) {
#else
            if (( InstrTable[index].cpu & P_CPU_MASK ) > ( ModuleInfo.curr_cpu & P_CPU_MASK ) ||
                ( InstrTable[index].cpu & P_EXT_MASK ) > ( ModuleInfo.curr_cpu & P_EXT_MASK )) {
#endif
                buf->token = T_ID;
                buf->idarg = 0;
                return( NOT_ERROR );
            }
        }
        buf->token = T_INSTRUCTION;
        return( NOT_ERROR );
    }
    /* index may have been modified above; reload the reserved word index */
    index = buf->tokval;

    /* for RWT_SPECIAL, field <bytval> contains further infos:
     - RWT_REG:             register number (regnum)
     - RWT_DIRECTIVE:       type of directive (dirtype)
     - RWT_UNARY_OPERATOR:  operator precedence
     - RWT_BINARY_OPERATOR: operator precedence
     - RWT_STYPE:           memtype
     - RWT_RES_ID:          for languages, LANG_xxx value
                            for the rest, unused.
     */
    buf->bytval = SpecialTable[index].bytval;

    switch ( SpecialTable[index].type ) {
    case RWT_REG:
        buf->token = T_REG;
        break;
    case RWT_DIRECTIVE:
        buf->token = T_DIRECTIVE;
        /* the flags are set only while p->flags2 is still 0 */
        if ( p->flags2 == 0 )
            p->flags2 = SpecialTable[index].value;
        break;
    case RWT_UNARY_OP: /* OFFSET, LOW, HIGH, LOWWORD, HIGHWORD, SHORT, ... */
        buf->token  = T_UNARY_OPERATOR;
        break;
    case RWT_BINARY_OP: /* GE, GT, LE, LT, EQ, NE, MOD, PTR */
        buf->token = T_BINARY_OPERATOR;
        break;
    case RWT_STYPE:  /* BYTE, WORD, FAR, NEAR, FAR16, NEAR32 ... */
        buf->token = T_STYPE;
        break;
    case RWT_RES_ID: /* DUP, ADDR, FLAT, VARARG, language types [, FRAME (64-bit)] */
        buf->token = T_RES_ID;
        break;
    default: /* shouldn't happen */
        DebugMsg(("get_id: error, unknown type in SpecialTable[%u]=%u\n", index, SpecialTable[index].type ));
        /**/myassert( 0 );
        buf->token = T_ID;
        buf->idarg = 0;
        break;
    }
    return( NOT_ERROR );
}

/* get one token.
 * possible return values: NOT_ERROR, ERROR, EMPTY.
 *
 * the first character at p->input selects the scanner:
 * - a digit: get_number()
 * - a valid first character of a name: get_id()
 * - a dot that starts a "dotted" name (see below): get_id()
 * - a backquote (if BACKQUOTES is enabled and strict Masm
 *   compatibility is off): get_id_in_backquotes()
 * - anything else: get_special_symbol()
 *
 * names beginning with '.' are difficult to detect,
 * because the dot is a binary operator. The rules to
 * accept a "dotted" name are:
 * 1.- a valid ID char is to follow the dot
 * 2.- if buffer index is > 0, then the previous item
 *     must not be a reg, ), ] or an ID.
 * [bx.abc]    -> . is an operator
 * ([bx]).abc  -> . is an operator
 * [bx].abc    -> . is an operator
 * varname.abc -> . is an operator
 *
 * the characters are cast to unsigned char before they are passed to
 * the <ctype.h> functions, because these are undefined for negative
 * values other than EOF.
 */

#define is_valid_id_start( ch )  ( isalpha( (unsigned char)(ch) ) || (ch)=='_' || (ch)=='@' || (ch)=='$' || (ch)=='?' )

ret_code GetToken( struct asm_tok token[], struct line_status *p )
/****************************************************************/
{
    if( isdigit( (unsigned char)*p->input ) ) {
        return( get_number( token, p ) );
    } else if( is_valid_id_start( *p->input ) ) {
        return( get_id( token, p ) );
    } else if( *p->input == '.' &&
#if DOTNAMEX /* allow dots within identifiers */
              ( is_valid_id_char(*(p->input+1)) || *(p->input+1) == '.' ) &&
#else
              is_valid_id_char(*(p->input+1)) &&
#endif
              /* v2.11: member last_token has been removed; the previous
               * token is examined directly. token[-1] is accessed only
               * if p->index is > 0.
               */
              ( p->index == 0 || ( token[-1].token != T_REG && token[-1].token != T_CL_BRACKET && token[-1].token != T_CL_SQ_BRACKET && token[-1].token != T_ID ) ) ) {
        return( get_id( token, p ) );
#if BACKQUOTES
    } else if( *p->input == '`' && Options.strict_masm_compat == FALSE ) {
        return( get_id_in_backquotes( token, p ) );
#endif
    }
    return( get_special_symbol( token, p ) );
}

// fixme char *IfSymbol;        /* save symbols in IFDEF's so they don't get expanded */

/* Start a multi-line comment.
 * p: points to the text that starts with the comment delimiter
 *    (leading white space is skipped).
 * The first non-space character is stored as delimiter in
 * ModuleInfo.inside_comment. If the delimiter occurs a second time in the
 * rest of the text, the comment ends in the same line and
 * ModuleInfo.inside_comment is reset to NULLC.
 * If there's no delimiter at all, COMMENT_DELIMITER_EXPECTED is emitted.
 */
static void StartComment( const char *p )
/***************************************/
{
    /* the cast avoids undefined behavior of isspace() for negative chars */
    while ( isspace( (unsigned char)*p ) ) p++;
    if ( *p == NULLC ) {
        EmitError( COMMENT_DELIMITER_EXPECTED );
        return;
    }
    ModuleInfo.inside_comment = *p++;
    /* delimiter found a second time? then the comment is complete */
    if( strchr( p, ModuleInfo.inside_comment ) )
        ModuleInfo.inside_comment = NULLC;
    return;
}

int Tokenize( char *line, unsigned int start, struct asm_tok tokenarray[], unsigned int flags )
/*********************************************************************************************/
/*
 * create tokens from a source line.
 * line:  the line which is to be tokenized. It is modified in place:
 *        a trailing ';' comment is cut off (and copied to commentbuffer)
 *        and a continuation line may be appended behind a final comma.
 * start: where to start in the token buffer. If start == 0,
 *        then some variables are additionally initialized.
 * flags: 1=if the line has been tokenized already.
 * returns the index of the terminating T_FINAL entry in tokenarray[].
 *        If scanning fails (GetToken() error, too many tokens, line too
 *        long) the T_FINAL entry is stored at index <start> and <start>
 *        is returned, i.e. the line is skipped.
 */
{
    int                         rc;
    struct line_status          p;

    p.input = line;
    p.start = line;
    p.index = start;
    /* v2.11: member last_token is obsolete */
    p.flags = flags;
    p.flags2 = 0;
    p.flags3 = 0;
    if ( p.index == 0 ) {
#ifdef DEBUG_OUT
        cnttok0++;
#endif
        /* v2.06: ModuleInfo.line_flags is now initialized on a higher level */
        p.output = token_stringbuf;
        if( ModuleInfo.inside_comment ) {
            /* inside a COMMENT block: the line yields no tokens; the block
             * ends with the line that contains the delimiter.
             */
            DebugMsg1(("COMMENT active, delim is >%c<, line is >%s<\n", ModuleInfo.inside_comment, line));
            if( strchr( line, ModuleInfo.inside_comment ) != NULL ) {
                DebugMsg1(("COMMENT mode exited\n"));
                ModuleInfo.inside_comment = NULLC;
            }
            goto skipline;
        }
        /* v2.08: expansion operator % at pos 0 is handled differently
         * (no longer detected here).
         */
    } else {
#ifdef DEBUG_OUT
        cnttok1++;
#endif
        /* continue behind the token strings that are already stored */
        p.output = StringBufferEnd;
    }

    for( ;; ) {

        /* casts: <ctype.h> functions are undefined for negative char values */
        while( isspace( (unsigned char)*p.input ) ) p.input++;

        if ( *p.input == ';' && flags == TOK_DEFAULT ) {
            /* step back over the blanks in front of the ';' so that they
             * become part of the saved comment and the line ends directly
             * behind the last token.
             */
            while ( p.input > line && isspace( (unsigned char)*(p.input-1) ) ) p.input--;
            strcpy( commentbuffer, p.input );
            ModuleInfo.CurrComment = commentbuffer;
            *p.input = NULLC;
        }

        tokenarray[p.index].tokpos = p.input;

        if( *p.input == NULLC ) {
            /* if a comma is last token, concat lines ... with some exceptions
             * v2.05: moved from PreprocessLine(). Moved because the
             * concatenation may be triggered by a comma AFTER expansion.
             */
            if ( p.index > 1 &&
                tokenarray[p.index-1].token == T_COMMA
#if FASTPASS
                && ( Parse_Pass == PASS_1 || UseSavedState == FALSE ) /* is it an already preprocessed line? */
#endif
                && start == 0 ) {
                DebugMsg1(("Tokenize: calling IsMultiLine()\n" ));
                if ( IsMultiLine( tokenarray ) ) {
                    char *ptr = GetAlignedPointer( p.output, strlen( p.output ) );
                    DebugMsg1(("Tokenize: IsMultiLine(%s)=TRUE\n", line ));
                    if ( GetTextLine( ptr ) ) {
                        while ( isspace( (unsigned char)*ptr ) ) ptr++;
                        if ( *ptr ) {
                            /* NOTE(review): the length check runs after the
                             * copy - confirm that the line buffer has room
                             * for an over-long concatenation.
                             */
                            strcpy( p.input, ptr );
                            if ( strlen( p.start ) >= MAX_LINE_LEN ) {
                                EmitError( LINE_TOO_LONG );
                                p.index = start;
                                break;
                            }
                            DebugMsg1(("Tokenize: line concatenation, line=%s\n", line ));
                            continue;
                        }
                    }
                }
            }
            break;
        }
        tokenarray[p.index].string_ptr = p.output;
        rc = GetToken( &tokenarray[p.index], &p );
        if ( rc == EMPTY )
            continue;
        if ( rc == ERROR ) {
            p.index = start; /* skip this line */
            break;
        }
        /* v2.04: this has been moved here from condasm.c to
         * avoid problems with (conditional) listings. It also
         * avoids having to search for the first token twice.
         * Note: a conditional assembly directive within an
         *    inactive block and preceded by a label isn't detected!
         *    This is an exact copy of the Masm behavior, although
         *    it probably is just a bug!
         */
        if ( !(flags & TOK_RESCAN) ) {
            /* only the first token, or the first one behind "label:" /
             * "label::", is examined
             */
            if ( p.index == 0 || ( p.index == 2 && ( tokenarray[1].token == T_COLON || tokenarray[1].token == T_DBL_COLON) ) ) {
                if ( tokenarray[p.index].token == T_DIRECTIVE &&
                    tokenarray[p.index].bytval == DRT_CONDDIR ) {
                    if ( tokenarray[p.index].tokval == T_COMMENT ) {
                        /* NOTE(review): this message is emitted before
                         * StartComment() stores the new delimiter.
                         */
                        DebugMsg1(("tokenize: COMMENT starting, delim is >%c<\n", ModuleInfo.inside_comment));
                        StartComment( p.input );
                        break; /* p.index is 0 or 2 */
                    }
                    conditional_assembly_prepare( tokenarray[p.index].tokval );
                    if ( CurrIfState != BLOCK_ACTIVE ) {
                        p.index++;
                        break; /* p.index is 1 or 3 */
                    }
                } else if( CurrIfState != BLOCK_ACTIVE ) {
                    /* further processing skipped. p.index is 0 */
                    break;
                }
            }
        }
        p.index++;
        if( p.index >= MAX_TOKEN ) {
            DebugMsg1(("tokenize: token index %u >= MAX_TOKEN (=%u), line=>%s<\n", p.index, MAX_TOKEN, line ));
            EmitError( TOO_MANY_TOKENS );
            p.index = start;
            goto skipline; /* StringBufferEnd is left unchanged */
        }

#if TOKSTRALIGN
        p.output = GetAlignedPointer( token_stringbuf, p.output - token_stringbuf );
#endif

    }

#if TOKSTRALIGN
    p.output = GetAlignedPointer( token_stringbuf, p.output - token_stringbuf );
#endif
    StringBufferEnd = p.output;
skipline:
    /* terminate the token list; the T_FINAL entry carries p.flags3 */
    tokenarray[p.index].token  = T_FINAL;
    tokenarray[p.index].bytval = p.flags3;
    tokenarray[p.index].string_ptr = "";
    return( p.index );
}