%{

#include "linguist.h"

/* Store a token string and its type in the scanner's per-instance
 * extra data (struct tokenizer_extra *). Ownership of `tok` passes
 * to the caller that drains yyextra. */
#define feed_token(tok, typ) do { \
  yyextra->token = (tok); \
  yyextra->type = (typ); \
} while (0)

/* Discard input up to and including the next newline. If the stream
 * ends first (EOF or NUL), abort the current action by returning 0
 * from the scanner. */
#define eat_until_eol() do { \
  int c; \
  while ((c = input(yyscanner)) != '\n' && c != EOF && c); \
  if (c == EOF || !c) \
    return 0; \
} while (0)

/* Discard input until an unescaped occurrence of quote character `q`
 * or a newline. A backslash escapes the following character. On EOF
 * or NUL, abort the current action by returning 0 from the scanner. */
#define eat_until_unescaped(q) do { \
  int c; \
  while ((c = input(yyscanner)) != EOF && c) { \
    if (c == '\n') \
      break; \
    if (c == '\\') { \
      c = input(yyscanner); \
      if (c == EOF || !c) \
        return 0; \
    } else if (c == q) \
      break; \
  } \
  if (c == EOF || !c) \
    return 0; \
} while (0)

%}

%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment

%%

^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
  /* Shebang of the form "#!/usr/bin/env [VAR=val ...] interpreter":
   * the interpreter name is the last space-separated word. */
  const char *off = strrchr(yytext, ' ');
  if (!off)
    off = yytext;
  else
    ++off;
  feed_token(strdup(off), SHEBANG_TOKEN);
  eat_until_eol();
  return 1;
}

^#![ \t]*[[:alpha:]_\/]+ {
  /* Plain shebang ("#!/bin/bash"): the interpreter is the path's
   * basename. A bare "env" with no interpreter is skipped entirely. */
  const char *off = strrchr(yytext, '/');
  if (!off)
    off = yytext;
  else
    ++off;
  if (strcmp(off, "env") == 0) {
    eat_until_eol();
  } else {
    feed_token(strdup(off), SHEBANG_TOKEN);
    eat_until_eol();
    return 1;
  }
}

^[ t]*(//|–|#|%|")“ ”.* { nothing }

"/*" { BEGIN(c_comment); }
/* See below for xml_comment start. */
"{-" { BEGIN(haskell_comment); }
"(*" { BEGIN(ocaml_comment); }
"\"\"\"" { BEGIN(python_dcomment); }
"'''" { BEGIN(python_scomment); }

<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n { /* nothing */ }
<c_comment>"*/" { BEGIN(INITIAL); }
<xml_comment>"-->" { BEGIN(INITIAL); }
<haskell_comment>"-}" { BEGIN(INITIAL); }
<ocaml_comment>"*)" { BEGIN(INITIAL); }
<python_dcomment>"\"\"\"" { BEGIN(INITIAL); }
<python_scomment>"'''" { BEGIN(INITIAL); }

\"\"|'' { /* nothing */ }
\" { eat_until_unescaped('"'); }
' { eat_until_unescaped('\''); }

(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }

\<[[:alnum:]_!.\/?-]+ {
  /* "<!--" opens an XML comment; any other tag-like token is emitted
   * as an SGML token and switches to the sgml state. */
  if (strcmp(yytext, "<!--") == 0) {
    BEGIN(xml_comment);
  } else {
    feed_token(strdup(yytext), SGML_TOKEN);
    BEGIN(sgml);
    return 1;
  }
}

<sgml>[[:alnum:]_]+=\" { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('"'); return 1; }
<sgml>[[:alnum:]_]+=' { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('\''); return 1; }
<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
<sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
<sgml>\> { BEGIN(INITIAL); }
<sgml>.|\n { /* nothing */ }

;|\{|\}|\(|\)|\[|\] { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }

[[:alnum:]_.@#\/*]+ {
  /* A token starting with "/*" is a C comment unless it is already
   * self-closed ("/*...*/"); otherwise emit it as a regular token. */
  if (strncmp(yytext, "/*", 2) == 0) {
    if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
      /* nothing */
    } else {
      BEGIN(c_comment);
    }
  } else {
    feed_token(strdup(yytext), REGULAR_TOKEN);
    return 1;
  }
}

\<\<?|\+|\-|\*|\/|\%|\&\&?|\|\|? { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
.|\n { /* nothing */ }

%%