%{
#include "linguist.h"
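
/* Hand one scanned token back to the caller: feed_token() stashes the
 * token string and its type in the scanner's per-instance extra data
 * (struct tokenizer_extra, declared in linguist.h). */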
#define feed_token(tok, typ) do { \
    yyextra->token = (tok); \
    yyextra->type = (typ); \
  } while (0)
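
/* Discard the rest of the current line. Bails out of the enclosing
 * rule (return 0, ending the scan) if EOF or a NUL byte is hit first. */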
#define eat_until_eol() do { \
    int c; \
    while ((c = input(yyscanner)) != '\n' && c != EOF && c); \
    if (c == EOF || !c) \
      return 0; \
  } while (0)
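
/* Discard input up to the next unescaped occurrence of the quote
 * character q, also stopping at end of line; a backslash escapes the
 * following character. Bails out of the enclosing rule on EOF or NUL. */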
#define eat_until_unescaped(q) do { \
    int c; \
    while ((c = input(yyscanner)) != EOF && c) { \
      if (c == '\n') \
        break; \
      if (c == '\\') { \
        c = input(yyscanner); \
        if (c == EOF || !c) \
          return 0; \
      } else if (c == q) \
        break; \
    } \
    if (c == EOF || !c) \
      return 0; \
  } while (0)
%}
%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"

%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
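
/* Exclusive start conditions: one per multi-line comment style, plus
 * one for scanning attributes inside an SGML/XML tag. */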
%%
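
/* Shebang lines. This first rule handles "#!/usr/bin/env [VAR=val ...] prog"
 * and emits the interpreter name found after the last space. */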
^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
    const char *off = strrchr(yytext, ' ');
    if (!off)
      off = yytext;
    else
      ++off;
    feed_token(strdup(off), SHEBANG_TOKEN);
    eat_until_eol();
    return 1;
  }
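
/* Direct interpreter path, e.g. "#!/bin/ruby": the token is the path's
 * basename. A bare "env" with no program argument yields no token. */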
^#![ \t]*[[:alpha:]_\/]+ {
    const char *off = strrchr(yytext, '/');
    if (!off)
      off = yytext;
    else
      ++off;
    if (strcmp(off, "env") == 0) {
      eat_until_eol();
    } else {
      feed_token(strdup(off), SHEBANG_TOKEN);
      eat_until_eol();
      return 1;
    }
  }
^[ \t]*(\/\/|--|#|%|\")" ".*    { /* nothing */ }
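
/* Openers for multi-line comments: switch into the matching exclusive
 * state and swallow everything until the closer below. */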
"/*"    { BEGIN(c_comment); }
/* See below for xml_comment start. */
"{-"    { BEGIN(haskell_comment); }
"(*"    { BEGIN(ocaml_comment); }
"\"\"\""    { BEGIN(python_dcomment); }
"'''"    { BEGIN(python_scomment); }
<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n    { /* nothing */ }
<c_comment>"*/"    { BEGIN(INITIAL); }
<xml_comment>"-->"    { BEGIN(INITIAL); }
<haskell_comment>"-}"    { BEGIN(INITIAL); }
<ocaml_comment>"*)"    { BEGIN(INITIAL); }
<python_dcomment>"\"\"\""    { BEGIN(INITIAL); }
<python_scomment>"'''"    { BEGIN(INITIAL); }
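
/* String and numeric literals carry little signal for classification,
 * so they are consumed and discarded rather than emitted as tokens. */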
""|'' { /* nothing */ } " { eat_until_unescaped('“'); } ' { eat_until_unescaped('''); } (0x([0-9a-fA-F]|.)*|([0-9]|.)*)([lL]{0,2}|([eE][0-9]*)?*) { /* nothing */ } <[[:alnum:]_!./?-]+ {
if (strcmp(yytext, "<!--") == 0) { BEGIN(xml_comment); } else { feed_token(strdup(yytext), SGML_TOKEN); BEGIN(sgml); return 1; } }
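
/* Inside an SGML/XML tag: attribute names are emitted as tokens with
 * their quoted or unquoted values stripped; ">" returns to INITIAL. */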
<sgml>[_]+=" { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('“'); return 1; } <sgml>[_]+=' { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('''); return 1; } <sgml>[[:alnum:]_]+=[_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; } <sgml>[_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; } <sgml>> { BEGIN(INITIAL); } <sgml>.|n { /* nothing */ } ;|{|}|(|)|[|] { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; } [[:alnum:]_.@#/*]+ {
if (strncmp(yytext, "/*", 2) == 0) { if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) { /* nothing */ } else { BEGIN(c_comment); } } else { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; } }
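
/* Single- and double-character operators also count as regular tokens.
 * A "<" followed by a tag name still goes to the SGML rule above, since
 * flex prefers the longest match. */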
\<\<?|\+|\-|\*|\/|%|&&?|\|\|?    { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
.|\n    { /* nothing */ }
%%
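
/* Minimal usage sketch (illustrative, not part of the upstream scanner):
 * driving the reentrant, prefixed API that flex generates from the
 * options above. It assumes struct tokenizer_extra in linguist.h has
 * exactly the `token` and `type` fields used by the actions; names of
 * this helper and its output format are made up for the example. */
static void example_tokenize(const char *buf, int len)
{
    yyscan_t scanner;
    struct tokenizer_extra extra = { 0 };
    YY_BUFFER_STATE state;

    linguist_yylex_init_extra(&extra, &scanner);
    state = linguist_yy_scan_bytes(buf, len, scanner);
    /* Each yylex() return of 1 leaves one strdup'd token in extra. */
    while (linguist_yylex(scanner)) {
        if (extra.token) {
            printf("%d %s\n", (int)extra.type, extra.token);
            free(extra.token);
            extra.token = NULL;
        }
    }
    linguist_yy_delete_buffer(state, scanner);
    linguist_yylex_destroy(scanner);
}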