import { Grammar, Tokenizer } from 'earley-parser'
import { AST } from './ast.js'
import SyntacticTypes from './syntactic-types.js'
import { notationStringToArray } from './utilities.js'
import { builtInConcepts } from './built-in-concepts.js'
import { Template } from './template.js'
/**
* The Language class represents one of the languages among which a
* {@link Converter} can convert notation. To add a new language to a
* {@link Converter} instance, just call the constructor of this class, passing
* the {@link Converter} instance. Every Language instance needs a
* {@link Converter} instance to which it belongs, because the purpose of this
* class is to enable conversions among languages, by working together with a
* {@link Converter} instance.
*
* Each instance of this class contains the data passed to its constructor, plus
* a reference to its {@link Converter}, plus a tokenizer and parser for reading
* notation written in the language represented by this instance. After
* creating an instance of this class, you specify the language itself by
* repeated calls to the {@link Language#addNotation addNotation()} function.
* This step is necessary for the language to have any definition at all
* before you use it to parse text.
*
* Although you can call methods in this class directly to parse text, such as
* {@link Language#parse parse()} and {@link Language#convertTo convertTo()}, it
* is simpler if you instead use the {@link Converter#convert convert()} method
* of the corresponding {@link Converter} instance. Using the {@link Converter}
* will save you from having to deal with intermediate forms like {@link AST
* abstract syntax trees}, but you can create and work with such objects if you
* wish.
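*
* For example, here is a minimal sketch of the typical workflow, assuming a
* `Converter` instance of this repository's {@link Converter} class, and
* using `addition` as a hypothetical concept name:
*
*     const converter = new Converter()
*     const language = new Language( 'asciimath', converter )
*     language.addNotation( 'addition', 'A+B' )
*     // once enough notations have been added, the language can parse text:
*     const ast = language.parse( '1+2' )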
*/
export class Language {
/**
* To add a new language to a {@link Converter}, just call this constructor,
* passing the converter object as one of the parameters, and this
* constructor will add this language to that converter.
*
* The default grouping symbols for any newly installed language are `(` and
* `)`. You may wish to override this default for a number of reasons. For
* example, internally, when this class adds the putdown language, it
* specifies the empty array, since putdown uses no grouping symbols. And
* if you were to define a parser for LaTeX, you might want to add the
* symbols `{` and `}` to the list, since they are groupers in LaTeX. Note
* that the array must have even length, consisting of pairs of open and
* close groupers, in that order, as in `[ '(', ')', '{', '}' ]` for LaTeX.
*
* The default linter for any language is the identity function, meaning
* that no cleanup is needed for expressions of that language. If you want
* the {@link Converter#convert convert()} function, upon creating an
* expression in this language, to apply any specific formatting
* conventions you would like to see in the output, you can specify a
* linter, which will be run before {@link Converter#convert convert()}
* returns its result. Add such a function only if you see output from
* {@link Converter#convert convert()} that doesn't meet your standards,
* aesthetically or for some functional reason.
*
* For example, when putdown is installed as the initial language, it is
* given a linter that removes unnecessary spaces around parentheses.
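*
* For example, a hypothetical LaTeX-like language with extra groupers and a
* linter that collapses runs of spaces might be installed like this
* (`converter` is assumed to be an existing {@link Converter} instance):
*
*     const latex = new Language( 'latex', converter,
*         [ '(', ')', '{', '}' ],
*         output => output.replace( /  +/g, ' ' ) )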
*
* @param {String} name - the name of the new language (e.g., "latex")
* @param {Converter} converter - the converter into which to install this
* language
* @param {String[]} groupers - any pairs of grouping symbols used by the
* language (as documented above)
* @param {Function?} linter - a function that cleans up notation in this
* language (as documented above)
* @see {@link Converter#convert convert()}
* @see {@link Converter#language language()}
* @see {@link Converter#isLanguage isLanguage()}
*/
constructor ( name, converter, groupers = [ '(', ')' ], linter = x => x ) {
// Allow clients to pass "null" for groupers:
if ( !groupers ) groupers = [ ]
// Store all parameters:
this.name = name
this.converter = converter
this.groupers = groupers
this.linter = linter
// Also create a tokenizer and grammar and store them:
this.tokenizer = new Tokenizer()
this.tokenizer.addType( /\s/, () => null ) // ignore whitespace between tokens
this.grammar = new Grammar()
this.grammar.START = SyntacticTypes.hierarchies[0][0]
// For storing derived concepts' putdown forms:
this.derivedNotation = new Map()
// For each concept in the converter, if it is atomic and has a putdown
// form that is a single regular expression, use that as the default
// notation in the new language; this can be overridden by any later
// call to addNotation(). We therefore mark each such rule as a default
// here, so that addNotation() knows it may delete the rule later.
Array.from( converter.concepts.keys() ).forEach( conceptName => {
const concept = converter.concepts.get( conceptName )
if ( SyntacticTypes.isAtomic( concept.parentType ) ) {
if ( concept.putdown instanceof Array ) {
if ( concept.putdown.length != 1 ) return
if ( !( concept.putdown[0] instanceof RegExp ) ) return
} else if ( !( concept.putdown instanceof RegExp ) ) {
// otherwise it is a notation string, which must convert to exactly
// one regular expression
const array = notationStringToArray( concept.putdown,
Array.from( Template.defaultVariableNames ) )
if ( array.length != 1 ) return
if ( !( array[0] instanceof RegExp ) ) return
}
this.addNotation( conceptName, concept.putdown )
const rhss = this.rulesFor( conceptName )
rhss[rhss.length-1].putdownDefault = true
}
} )
// Add subtyping rules and grouping rules to the grammar.
// Also give each atomic type an empty rule list, so that the Earley
// parser does not throw an error about a nonexistent type if the client
// chooses not to include that atomic type in their language.
SyntacticTypes.hierarchies.forEach( hierarchy => {
for ( let i = 0 ; i < hierarchy.length - 1 ; i++ )
this.grammar.addRule( hierarchy[i], hierarchy[i+1] )
if ( groupers.length > 0 && hierarchy[0] == this.grammar.START ) {
const last = hierarchy[hierarchy.length-1]
if ( SyntacticTypes.isAtomic( last ) ) {
for ( let i = 0 ; i < groupers.length - 1 ; i += 2 ) {
const left = groupers[i]
const right = groupers[i+1]
this.addNotation( `Grouped${last}`, `${left} A ${right}` )
}
}
}
const lowest = hierarchy[hierarchy.length-1]
if ( !this.grammar.rules.hasOwnProperty( lowest ) )
this.grammar.rules[lowest] = [ ]
} )
// Register ourselves with our converter
converter.languages.set( name, this )
}
/**
* Add a new notation to this language, for one of its converter's concepts.
* Specify the name of the concept being represented, then the notation
* using a string in which the letters `A`, `B`, and `C` represent the
* first, second, and third arguments, respectively. (You can omit any
* arguments you do not need. For example, you might write `A+B` for
* addition, `-A` for negation, or just `\\bot` for the logical constant
* "false" in LaTeX.)
*
* The options object supports the following fields.
*
* * If you need to use one of the letters `A`, `B`, or `C` in the notation
* itself, or if you need to use more than three parameters in your
* notation (continuing on to `D`, `E`, etc.) then you can use the
* options object to specify the variables in your notation. For
* example, you could use notation `x+y` and then use the options object
* to specify `{ variables : [ 'x', 'y' ] }`. Note that every occurrence
* of a variable counts as that variable (except when it appears inside
* another word), even if it is used multiple times. So choose variable
* names that do not otherwise show up in the notation you are introducing.
* * If this notation should be used only for representing the concept in
* this language, but not for parsing from this language into an AST,
* then you can set `writeOnly : true`. This can be useful in two cases.
* 1. When you have multiple notations for the same concept in some
* languages, but not in others, you can map each notation to a
* separate concept, then map all concepts to one notation in the
* smaller language, marking all but one as write-only, thus
* establishing a canonical form. And yet between any two languages
* that support all the notations, translation can preserve the
* notational subtleties.
* 2. If you have some notation that is just a shorthand for a more
* complex notation, you can parse the notation to a concept named
* for that notation, but convert to putdown form in a write-only
* way, expanding the notation to its underlying (compound) meaning.
* Then the converter will not attempt to invert that expansion when
* parsing putdown, but will preserve its expanded meaning.
*
* There are no other options at this time besides those documented above,
* but the options object is available for future expansion.
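*
* For example (using hypothetical concept names for illustration):
*
*     // default variable names A, B, C:
*     language.addNotation( 'addition', 'A+B' )
*     // custom variable names, for when A, B, or C appears in the notation:
*     language.addNotation( 'absoluteValue', '|x|', { variables : [ 'x' ] } )
*     // write-only notation, used for output but never for parsing:
*     language.addNotation( 'squareRoot', 'A^{1/2}', { writeOnly : true } )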
*
* @param {String} conceptName - the name of the concept represented by this
* new notation
* @param {String} notation - the notation being added
* @param {Object?} options - any additional options, as documented above
*/
addNotation ( conceptName, notation, options = { } ) {
const originalNotation = notation
options = Object.assign( {
variables : Array.from( Template.defaultVariableNames )
}, options )
// ensure concept is valid
const concept = this.converter.concept( conceptName )
if ( !concept )
throw new Error( `Not a valid concept: ${conceptName}` )
// if this is an atomic concept, and the only previous notation it had
// was the putdown default installed at language creation time, then the
// user is now overriding that, so we should delete it.
if ( SyntacticTypes.isAtomic( concept.parentType )
&& this.grammar.rules.hasOwnProperty( conceptName )
&& this.grammar.rules[conceptName].length == 1
&& this.grammar.rules[conceptName][0].putdownDefault )
delete this.grammar.rules[conceptName]
// convert notation to array if needed and extract its tokens
if ( notation instanceof RegExp )
notation = [ notation ]
const notationToPutdown = [ ]
if ( !( notation instanceof Array ) ) {
notation = notationStringToArray( notation, options.variables ).map(
piece => {
const variableIndex = options.variables.indexOf( piece )
if ( variableIndex > -1 ) {
notationToPutdown.push( variableIndex )
return concept.typeSequence[variableIndex]
} else {
return piece
}
}
)
}
const putdownToNotation = notationToPutdown.slice()
notationToPutdown.forEach( ( pi, ni ) => putdownToNotation[pi] = ni )
notation.forEach( piece => {
if ( piece instanceof RegExp ) this.addTokenType( piece )
} )
// add the notation to the grammar (doing so can modify the notation array
// in-place, which is why we did not do it until after using the array to
// create token types, above)
let parentType = concept.parentType
// If this notation is write-only, add it to the derived notations list:
if ( options.writeOnly ) {
this.derivedNotation.set( conceptName, notation )
// record in that notation array several data needed for rendering
notation.notationToPutdown = notationToPutdown
notation.putdownToNotation = putdownToNotation
notation.notation = originalNotation
notation.variables = options.variables
// But if it is NOT write-only, add it to the parser:
} else {
// putdown is a special language in which every notation counts as a
// grouper, so everything has high precedence (lowest subtype)
if ( this.name == 'putdown' )
parentType = SyntacticTypes.lowestSubtype( parentType )
this.grammar.addRule( parentType, conceptName )
this.grammar.addRule( conceptName, notation )
const rhss = this.rulesFor( conceptName )
const newRule = rhss[rhss.length - 1]
// record in the new rule RHS several data about how the rule was made
newRule.notationToPutdown = notationToPutdown
newRule.putdownToNotation = putdownToNotation
newRule.notation = originalNotation
newRule.variables = options.variables
}
}
/**
* Treat the given text as an expression in this language and attempt to
* parse it. Return an abstract syntax tree ({@link AST}) on success, or
* `undefined` on failure. Or, if you set the optional second parameter to
* true, it will return an array of all possible parsings, each as an
* {@link AST}.
*
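* For example (assuming `language` is a Language instance in which simple
* arithmetic notation has been defined):
*
*     const one = language.parse( '1+2' )          // an AST, or undefined
*     const all = language.parse( '1+2+3', true )  // an array of ASTs
*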
* @param {String} text - the input text to parse
* @param {boolean} ambiguous - if true, return all possible meanings of the
* given text, which will be more than one if the text is ambiguous;
* defaults to false, which returns just one AST or undefined
* @returns {AST|AST[]} - if `ambiguous` is set to false, returns the parsed
* AST, or undefined if parsing failed; if `ambiguous` is set to true,
* returns all parsed ASTs as an array, which may be empty
* @see {@link AST#compact compact()}
*/
parse ( text, ambiguous = false ) {
// Ensure tokenization succeeds, or throw an appropriate error
const tokens = this.tokenizer.tokenize( text )
if ( !tokens ) {
if ( this._debug ) {
console.log( `Tokenizer failed to tokenize "${text}"` )
console.log( this.tokenizer.tokenTypes )
}
return
}
// Find all possible parsings and sort them alphabetically
const all = this.grammar.parse( tokens, {
showDebuggingOutput : this._debug
} )
all.sort( ( a, b ) =>
JSON.stringify( a ).localeCompare( JSON.stringify( b ) ) )
// If they asked for multiple results, return an array (but filter out
// the ones that don't associate correctly).
if ( ambiguous )
return all.map( json => AST.fromJSON( this, json ) )
.filter( ast => ast.associatesCorrectly() )
// They want just one result, so find the first AST that associates
// correctly and return it.
for ( let i = 0 ; i < all.length ; i++ ) {
const ast = AST.fromJSON( this, all[i] )
if ( ast.associatesCorrectly() ) return ast
}
}
/**
* Convert text in this language to text in another language. If the text
* cannot be parsed in this language, then undefined is returned instead.
* Note that this object and `language` must have the same {@link Converter}
* instance associated with them, or this function will throw an error.
*
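* For example (assuming `latex` and `putdown` are Language instances
* sharing one Converter, with suitable notations defined in each):
*
*     const result = latex.convertTo( 'x+y', putdown )
*     // result is the equivalent putdown text, or undefined
*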
* @param {String} text - the text in this language to be converted to the
* other language
* @param {Language} language - the destination language
* @param {boolean} ambiguous - passed to the {@link Language#parse
* parse()} function, and thus determines whether the result of this
* function is a string or an array thereof
* @returns {String|String[]} the converted text, if the conversion was
* possible, and undefined otherwise (or an array of strings if
* `ambiguous` is true)
*/
convertTo ( text, language, ambiguous = false ) {
if ( this.converter != language.converter )
throw new Error( 'The two languages do not share a Converter' )
// return one result:
if ( !ambiguous )
return this.parse( text )?.toLanguage( language )
// return multiple results, with duplicates removed, order preserved:
// (if tokenization fails, parse() returns undefined, so default to [ ])
const all = ( this.parse( text, true ) || [ ] )
.map( ast => ast.toLanguage( language ) )
return all.filter( ( item, index ) =>
!all.slice( 0, index ).includes( item ) )
}
/**
* Get all grammar rules for the given concept or syntactic type. The
* result is an array of the right-hand sides of the grammar rules for the
* concept or syntactic type. Each such right-hand side is the array of
* tokens or type names used internally by the parser.
*
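* For example (with `addition` as a hypothetical concept name):
*
*     const rhss = language.rulesFor( 'addition' )
*     // one right-hand side per notation added for that concept
*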
* @param {String|AST} type - if this is a string, it must be the name of
* the concept or syntactic type to look up; if it is a leaf AST, then its
* contents as a string are used; if it is a compound AST, then its head
* is used
* @returns {Array} an array of the right-hand sides of the grammar rules
*/
rulesFor ( type ) {
if ( type instanceof AST )
type = type.isCompound() ? type.head().contents : type.contents
// copy the rules array, so that pushing a derived notation below does
// not permanently add it to the grammar itself
const result = ( this.grammar.rules[type] || [ ] ).slice()
if ( this.derivedNotation.has( type ) )
result.push( this.derivedNotation.get( type ) )
return result
}
/**
* This static member of the class contains regular expressions for some
* common types of notation. The following regular expressions are
* available to make it easier to define new concepts or notations.
*
* * `oneLetterVariable` - a single letter variable expressed in Roman
* letters (lower-case or upper-case A, B, C, ...)
* * `nonnegativeInteger` - an integer expressed using just the digits 0-9
* * `integer` - same as the previous, but possibly preceded by a `-`
* * `nonnegativeNumber` - a number expressed using the digits 0-9 and a
* decimal point (optional)
* * `number` - same as the previous, but possibly preceded by a `-`
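*
* For example, one of these can be passed directly to {@link
* Language#addNotation addNotation()}, which accepts a regular expression
* as the notation for an atomic concept (the concept name here is
* hypothetical):
*
*     language.addNotation( 'integerAtom',
*         Language.regularExpressions.integer )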
*/
static regularExpressions = {
oneLetterVariable : /[a-zA-Z]/, // can be upgraded later with Greek, etc.
nonnegativeInteger : /\d+/,
integer : /-?\d+/,
nonnegativeNumber : /\.[0-9]+|[0-9]+\.?[0-9]*/,
number : /-?\.[0-9]+|[0-9]+\.?[0-9]*/
}
/**
* Rather than creating an empty language using this class's constructor,
* then adding each notation with a separate function call, you can
* construct an instance and add all the notations in one function call with
* this method.
*
* The format for the JSON data structure passed as the third argument is as
* follows.
*
* * It should have a `"groupers"` field that is an array of strings
* containing the exact same data you would pass as the `groupers`
* argument to the constructor.
* * It should have a `"notations"` field that is an array of objects,
* each object having the fields `"concept"`, `"notation"`, and
* `"options"`, which correspond directly to the three parameters of the
* {@link Language#addNotation addNotation()} function.
*
* To see an example of such a data structure, examine the contents of the
* file {@link latex-notation.js} in this repository.
*
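* For example, a small language definition might look like this (the
* concept names are hypothetical):
*
*     {
*         "groupers" : [ "(", ")" ],
*         "notations" : [
*             { "concept" : "addition", "notation" : "A+B" },
*             { "concept" : "negation", "notation" : "-x",
*               "options" : { "variables" : [ "x" ] } }
*         ]
*     }
*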
* @param {string} name - the name of the language, just as in this class's
* constructor
* @param {Converter} converter - a {@link Converter} instance, just as in
* this class's constructor
* @param {Object} json - the JSON representation of the language, as
* described above
* @param {boolean} addConceptsAlso - if true, before constructing the
* Language, examine all built-in concepts mentioned in any of its
* notations, and add them to the `converter` using
* {@link Converter#addBuiltIns addBuiltIns()}.
*/
static fromJSON ( name, converter, json, addConceptsAlso = true ) {
if ( addConceptsAlso ) {
// Add all concepts explicitly mentioned in the language
const conceptNames = json.notations.filter( entry => entry.concept )
.map( entry => entry.concept )
converter.addBuiltIns( [ ...new Set( conceptNames ) ] )
// But every language also contains the concepts defined by regular
// expressions, so we have to add them too
const regexpNames = builtInConcepts.filter(
entry => entry.regularExpression ).map( entry => entry.name )
converter.addBuiltIns( regexpNames )
}
const result = new Language( name, converter, json.groupers )
json.notations.forEach( entry => result.addNotation(
entry.concept, entry.notation, entry.options ) )
return result
}
// Internal use only
// Does this language have a token in its tokenizer that is equal to the
// given regular expression?
hasTokenType ( regexp ) {
return this.tokenizer.tokenTypes.find( tokenType =>
tokenType.regexp.source == `^(?:${regexp.source})` )
}
// Internal use only
// Does this language have a token in its tokenizer that is a regular
// expression matching the given string? (Note that we add a $ to test for
// a full-string match.)
hasTokenMatching ( string ) {
return this.tokenizer.tokenTypes.find( tokenType =>
new RegExp( tokenType.regexp.source + '$' ).test( string ) )
}
// Internal use only
// Add a given regular expression to our tokenizer, iff such a token is not
// already present, and then sort the list of tokens in a way that
// prioritizes built-in regular expressions lower (because they are infinite
// sets) and that sorts other regular expressions alphabetically, except
// that a token that is a prefix of another sorts after the longer one (so
// that subwords can't be prioritized over full words that contain them).
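// For example (hypothetical tokens): if a token for "\f" were tried before
// one for "\frac", the tokenizer could split "\frac" into "\f" plus "rac";
// sorting longer tokens before their prefixes prevents this.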
// Note that if the regexp has a property called "originalString", which
// will have been put there by notationStringToArray(), then the regexp was
// designed to match exactly one string. In that case, if any existing
// token matches that same string, then the new token will not be added.
// This prevents bugs like the following: if you define function inverse to
// be f^{-1}, the "1" must not become its own token, or it would bork
// ordinary decimal numbers like 1.5. notationStringToArray() makes an
// exception to this rule for one-regexp notations, which are clearly
// intended to be tokenized as-is.
addTokenType ( regexp ) {
if ( this.hasTokenType( regexp ) ) return
if ( regexp.hasOwnProperty( 'originalString' )
&& this.hasTokenMatching( regexp.originalString ) ) return
this.tokenizer.addType( regexp )
const lowPriority = Object.keys( Language.regularExpressions )
.map( key => `^(?:${Language.regularExpressions[key].source})` )
const shouldBeLater = re => lowPriority.includes( re.source ) ? 1 : 0
const tokenSort = ( a, b ) => a == '' && b == '' ? 0 :
a.length == 0 ? 1 : // sort Xsuffix < X
b.length == 0 ? -1 : // sort Xsuffix < X
a[0] < b[0] ? -1 :
a[0] > b[0] ? 1 :
tokenSort( a.substring( 1 ), b.substring( 1 ) )
this.tokenizer.tokenTypes.sort( ( a, b ) => {
const aLater = shouldBeLater( a.regexp )
const bLater = shouldBeLater( b.regexp )
if ( aLater != bLater ) return aLater - bLater
a = a.regexp.source
b = b.regexp.source
a = a.substring( 4, a.length - 1 ) // turn ^(?:foo) into foo
b = b.substring( 4, b.length - 1 ) // same
return tokenSort( a, b )
} )
}
}