lib/parser/lexer.js

const fs = require("fs");
const tokenRegex = require("./tokenRegex.js");
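
// Note: the matchers exported by ./tokenRegex.js are assumed to behave like
// sticky RegExps. Each exposes a writable `lastIndex` and an `exec(source)`
// method that returns either null or a token object carrying `type` and
// `value` properties; this contract is inferred from the usage below rather
// than documented here.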

class Lexer {
  constructor() {
    this.lastIndex = 0;
    this.line = 1;
  }

  setProgram(program) {
    this.program = program;

    this.lastIndex = 0;
    this.line = 1;

    return this;
  }

  tokenize(program) {
    this.setProgram(program);

    return this._getTokens();
  }

  static tokenize(program) {
    return new Lexer().tokenize(program);
  }

  tokenizeFromFile(file) {
    const program = fs.readFileSync(file, "utf-8");

    return this.tokenize(program);
  }

  static tokenizeFromFile(file) {
    return new Lexer().tokenizeFromFile(file);
  }

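  // Replace the matcher list globally; Lexer.TOKENS is shared state, so this
  // affects every Lexer instance.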
  static setTokens(tokens) {
    Lexer.TOKENS = tokens;
  }

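  // Net parenthesis balance of `program`: +1 for every "(" token, -1 for
  // every ")". Zero means balanced; positive means unclosed "(".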
  getParBalance(program) {
    this.setProgram(program);

    let stack = 0;
    const tokens = this._getTokens();
    for (const token of tokens) {
      if (token.type === "LP") {
        ++stack;
      } else if (token.type === "RP") {
        --stack;
      }
    }

    return stack;
  }

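  // Pull tokens one at a time until the end of the input, then run the
  // post-processing pass over the whole stream.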
  _getTokens() {
    const tokens = [];

    let currentToken = this._getToken();
    while (currentToken !== null) {
      tokens.push(currentToken);

      currentToken = this._getToken();
    }

    return this.__transformTokens(tokens);
  }

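  // Produce the next token at the current position, skipping whitespace and
  // keeping the line counter up to date. Returns null at end of input.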
  _getToken() {
    // Point every matcher at the current scan position
    this.__updateLastIndices();

    // Match and skip whitespace, counting newlines to keep `line` accurate
    const whitespaces = Lexer.WHITES.exec(this.program);
    if (whitespaces !== null) {
      // Rewind the newline matcher first: __updateLastIndices aimed it at the
      // program-wide position, but it scans the short whitespace run itself
      // here and would otherwise look past the end of that string.
      Lexer.NEWLINE.lastIndex = 0;
      if (Lexer.NEWLINE.exec(whitespaces.value)) {
        ++this.line;
      }
      this.lastIndex = Lexer.WHITES.lastIndex;

      return this._getToken();
    }

    // Try each token expression in priority order
    let token = null;
    for (let i = 0; i < Lexer.TOKENS.length; ++i) {
      const match = Lexer.TOKENS[i].exec(this.program);

      // When an expression matches, record the current line and stop looking
      if (match !== null) {
        match.line = this.line;
        this.lastIndex = Lexer.TOKENS[i].lastIndex;

        token = match;
        break;
      }
    }

    // Return the matched token, or null if nothing was found
    return token;
  }

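  // Post-processing pass: rewrites the two pieces of syntactic sugar
  // documented in the comments below, mutating the token stream in place.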
  __transformTokens(tokens) {
    for (let i = 0; i < tokens.length; ++i) {
      // x:  =>  "x",  (a WORD immediately before ":" becomes a STRING)
      if (tokens[i].type === "WORD") {
        const nextToken = tokens[i + 1];
        if (nextToken && nextToken.value === ":") {
          tokens[i].type = "STRING";
        }
      }

      // Replace dots with parenthesized calls:
      // a.b        =>  a("b")
      // a.b.c      =>  a("b")("c")
      // a.b(c, d)  =>  a("b", c, d)
      if (tokens[i].type === "LP" && tokens[i].value === ".") {
        tokens[i].value = "(";

        const expr = tokens[i + 1];
        const arg = tokens[i + 2];

        if (expr && expr.type === "WORD") {
          expr.type = "STRING";
        }

        if (arg && arg.type === "LP" && arg.value !== ".") {
          // a.b(c, d): fold the call's "(" into a "," so its arguments join
          // the synthesized call: a("b", c, d)
          arg.type = "COMMA";
          arg.value = ",";
        } else {
          // a.b: close the synthesized call right after the string
          tokens.splice(i + 2, 0, { type: "RP", value: ")" });
        }
      }
    }

    return tokens;
  }

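  // Rewind every matcher to the lexer's current position, so that each exec
  // call starts scanning exactly where the previous token ended.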
  __updateLastIndices() {
    Lexer.TOKENS.forEach(expr => {
      expr.lastIndex = this.lastIndex;
    });
    Lexer.WHITES.lastIndex = this.lastIndex;
    Lexer.NEWLINE.lastIndex = this.lastIndex;
  }
}

Lexer.TOKENS = [
  tokenRegex.NUMBER,
  tokenRegex.STRING,
  tokenRegex.REGEX,
  tokenRegex.WORD,
  tokenRegex.LP,
  tokenRegex.RP,
  tokenRegex.COMMA
];

Lexer.WHITES = tokenRegex.WHITES;
Lexer.NEWLINE = tokenRegex.NEWLINE;

module.exports = {
  Lexer
};
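
// Usage sketch (illustrative only; assumes ./tokenRegex.js provides matchers
// following the contract noted at the top of this file):
if (require.main === module) {
  // The dot sugar in "a.b(c, d)" is rewritten by __transformTokens so the
  // token stream reads as: a ( "b" , c , d )
  const tokens = Lexer.tokenize("a.b(c, d)");
  console.log(tokens.map(t => `${t.type}:${t.value}`).join(" "));

  // One "(" is left unclosed here, so the balance is 1
  console.log(new Lexer().getParBalance("(foo (bar)"));
}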