Writing parsers for your applications

At some point in your life as a programmer, you’ll need to parse data or, even worse, parse data from a format that is not a standard like JSON, CSV or XML.

For most such use cases, writing a text-processing script in Python is enough. But what if you want to create your own data format or, even better, your own programming language?

Writing a CSS transpiler

Transpilers are programs that convert one language into another. Many programming languages use compilers that are internally transpilers for some type of object code.

In this example, we’ll take a look at a transpiler that converts a super-dialect of CSS into plain CSS. Let’s call it PSS.

What we’ll need

•Flex/Bison and a C/C++ compiler like gcc/g++
•Basic understanding of compiler design

Target grammar

Mainly I wanted the language to implement inheritance. Imagine if a css block could inherit all the styles of another block. That plus a few more features like aliasing an existing class name can be implemented.

Keep in mind that this is a hobby project that I use regularly for personal tools. If you want an industry-standard alternative, refer to Sass.

This is what a sample pss file looks like.

$alert_padding = 5px;
color1 = [#03AED2,#FDDE55];

body .flex{
	background-color:red;
}

.flex-container {
  display: flex;
  flex-wrap: wrap;
  flex-direction: row;
  justify-content: start;
  align-items: auto;
  align-content: start;
  background-color:color1;
}

.flex-container alias .unflexed-container;

.flex-item {
  flex: 0 0 auto;
  margin: 10px;
}


.sidebar extends .flex-item{
   /*testing comment*/
   !padding-top : 10px;
   padding-bottom : 10px;
   height: 100vh;
 }

 /* small header */
.alert.alert-warning {
   padding-top: calc(alert_padding + 21px);
   padding-bottom: alert_padding;
}

/* items list */
.nav-pills .nav-link
.nav-pills .nav-link extends .sidebar .flex-container{
  width:100%;
}

.card .card-body {
  padding-top: 2px; 
  padding-bottom: 2px; 
}

.btn-link{
  text-decoration: none;
}

This is the CSS our transpiler generates.

.flex
body{
	background-color : red;
}
.flex-container{
	display : flex;
	flex-wrap : wrap;
	flex-direction : row;
	justify-content : start;
	align-items : auto;
	align-content : start;
	background-color : var(--color1);
}
.unflexed-container{
	display : flex;
	flex-wrap : wrap;
	flex-direction : row;
	justify-content : start;
	align-items : auto;
	align-content : start;
	background-color : var(--color1);
}
.flex-item{
	flex : 0 0 auto;
	margin : 10px;
}
.sidebar{
	flex : 0 0 auto;
	margin : 10px;
/*testing comment*/
	padding-top : 10px !important ;
	padding-bottom : 10px;
	height : 100vh;
}
/* small header */
.alert-warning
.alert{
	padding-top : calc(5px + 21px);
	padding-bottom : 5px;
}
/* items list */
.nav-link
.nav-pills
.nav-link
.nav-pills{
	display : flex;
	flex-wrap : wrap;
	flex-direction : row;
	justify-content : start;
	align-items : auto;
	align-content : start;
	background-color : var(--color1);
	flex : 0 0 auto;
	margin : 10px;
	padding-top : 10px;
	padding-bottom : 10px;
	height : 100vh;
	width : 100%;
}
.card-body
.card{
	padding-top : 2px;
	padding-bottom : 2px;
}
.btn-link{
	text-decoration : none;
}
:root{
	--color1:#03AED2;
}

As you can see, the language supports inheritance via the ‘extends’ keyword, class-name aliasing, variables, and variables whose value is picked at random from a list (as with ‘color1’ above).

Writing the tokenizer

The tokenizer will scan through the target code and understand the ‘tokens’ that comprise the grammar. A token could be anything – a semicolon, a keyword, opening or closing braces. Anything that is quantifiably the smallest unit in a grammar.

Below is the flex program to implement the scanner

%{
#include<string.h>
#include<iostream>
#include<stdbool.h>
#include "y.tab.h"

/* Evaluates x once and, if the result is anything other than -1, returns that
   result from the enclosing function; a result of -1 falls through to the next
   statement. Despite the name, the early return happens when x produced a
   usable token code: the helpers in this file return -1 to mean "no token". */
#define RETURN_IF_INVALID(x)\
do{\
	int _ret = (x);\
	if(_ret!=-1)\
		return _ret;\
}while(0)


/* Buffer initialised to an empty string; nothing in the visible scanner code reads or writes it. */
char id_buffer[256]= "\0";
/* True while the scanner is between a comment opener and its closer; toggled by the comment-delimiter rules. */
bool commented = false;

/* Maps the current lexeme (yytext) to its keyword token.
   Returns EXTENDS or ALIAS on an exact match, or -1 when the lexeme is not a keyword. */
int is_keyword(){
	const bool matches_extends = (strcmp(yytext,"extends") == 0);
	if(matches_extends)
		return EXTENDS;

	const bool matches_alias = (strcmp(yytext,"alias") == 0);
	if(matches_alias)
		return ALIAS;

	return -1;
}
/* While the scanner is inside a comment, hands the current lexeme to the parser
   as a COMMENT token (yylval.string receives a strdup'd copy of yytext).
   Outside a comment it returns -1 so the calling rule emits its regular token. */
int parse_comment(){
	if(!commented)
		return -1;

	yylval.string = strdup(yytext);
	return COMMENT;
}

int yylex(void);
%}

/* Named patterns referenced by the rules section. */
/* IDENTIFIER: one or more letters or underscores, then any number of letters, underscores or digits. */
IDENTIFIER [A-Za-z_]+[A-Za-z_0-9]*
/* WHITESPACE: exactly one space, tab or newline character. */
WHITESPACE [ \t\n]
COMMENT_BEGIN "/*"
COMMENT_END "*/"

/* Declares an exclusive start condition named "property"; no rule in this listing is attached to it. */
%x property
/* noyywrap: the generated scanner treats end of file as end of input without calling a user yywrap(). */
%option noyywrap

%%
	/* Every token rule first runs parse_comment() through RETURN_IF_INVALID, so
	   while `commented` is set the matched text is returned as a COMMENT token
	   instead of the rule's own token. The two comment-delimiter rules are the
	   exception: they only toggle `commented` and echo the delimiter.
	   Square brackets are reported with the same token kinds as curly braces.
	   A name is a PROPERTY_NAME only when it is followed (after optional
	   whitespace) by a colon; the final catch-all rule turns any other single
	   character into a TEXT token. */
\{      				{ RETURN_IF_INVALID(parse_comment()); return OPENING_BRACES;}
\}      				{ RETURN_IF_INVALID(parse_comment()); return CLOSING_BRACES;}
{IDENTIFIER}(-[A-Za-z_0-9]+)*/{WHITESPACE}*: 		{ RETURN_IF_INVALID(parse_comment());yylval.string = strdup(yytext);return PROPERTY_NAME;}
{IDENTIFIER}		{ 
					RETURN_IF_INVALID(parse_comment());
					int keyword = is_keyword();
					if(keyword != -1) {return keyword;}
					else { 
						yylval.string = strdup(yytext);
						return IDENTIFIER;
					}
					/*this idiom up above works for 'anything else a.k.a .*'*/
					}
\.{IDENTIFIER}(-[A-Za-z_0-9]+)* 	{ RETURN_IF_INVALID(parse_comment()); yylval.string = strdup(yytext);return CLASS_NAME;}
{COMMENT_BEGIN}				{ commented=true; printf("\n");	ECHO; }
{COMMENT_END}				{ commented=false;		ECHO; }
;					{ RETURN_IF_INVALID(parse_comment()); return PROPERTY_SEPARATOR; }
\[					{ RETURN_IF_INVALID(parse_comment()); return OPENING_BRACES; }
\]					{ RETURN_IF_INVALID(parse_comment()); return CLOSING_BRACES; }
:					{ RETURN_IF_INVALID(parse_comment()); return PAIR_SEPARATOR; }
{WHITESPACE} 				{ RETURN_IF_INVALID(parse_comment()); return WHITESPACE;}
= 					{ RETURN_IF_INVALID(parse_comment()); return ASSIGN;}
\$ 					{ RETURN_IF_INVALID(parse_comment()); return SCALAR;}
!					{ RETURN_IF_INVALID(parse_comment()); return IMP_MARKER;}
.					{ RETURN_IF_INVALID(parse_comment());yylval.string = strdup(yytext); return TEXT; }
%%

Writing the parser rules

Once we have written the tokenizer grammar, we need to write the rules for the language. Forgetting advanced decorators, the rules for CSS are pretty straightforward – especially since we don’t have to deal with recursive constructs like loops or conditional blocks.

In this example the bison program also handles the code generation process (which should ideally be handled in a separate source file)

Below is the bison program to implement the parser

%{
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <string>
#include <sstream>
#include <unordered_map>
#include <vector>
#include <numeric>
#include <iostream>
#include <random>

/* Appends one flag to is_var; the variable rule invokes it once per assignment it parses. */
#define set_is_var(x) is_var.push_back(x)

/* Not referenced anywhere in the visible code. */
char s_buffer[256];
/* While true, register_class() records class names as parents instead of starting a new class. */
bool extends_mode_on = false;
/* Set when the class being defined has parents; tells output_properties() to copy their properties. */
bool active_class_extends = false;

/* Variable name -> value text, filled by the variable rule. */
std::unordered_map<std::string,std::string> variables;
/* One flag per parsed assignment, in parse order: true for the `$name = value;` form, false otherwise.
   NOTE(review): readers index this vector by an entry's position inside `variables`, an
   unordered_map whose iteration order is unspecified -- confirm the positions actually line up. */
std::vector<bool> is_var;

/* Class name -> the property strings emitted for that class (own and inherited). */
std::unordered_map<std::string,std::vector<std::string>> classes;
/* Parent class names collected while extends_mode_on is set; cleared by output_properties(). */
std::vector<std::string> parent_classes;
/* Set by the opt_important rule: whether the property being parsed carried the `!` marker. */
bool marked_important = false;

/* Name of the class most recently registered outside extends mode. */
char active_class[256];

/* Registers a class name seen by the parser.
 *
 * In extends mode (extends_mode_on is true) the name is appended to
 * parent_classes as a class to inherit from. Otherwise the name is printed on
 * a new line as part of the selector being emitted and becomes active_class.
 *
 * s: NUL-terminated class name. A name too long for active_class is truncated
 *    instead of being written past the end of the buffer.
 */
void register_class(const char* s){
	if(extends_mode_on){
		parent_classes.emplace_back(s);
		return;
	}

	printf("\n%s",s);
	// Bounded copy: active_class is a fixed-size buffer and s comes straight
	// from the input file, so an unchecked strcpy could overflow it.
	snprintf(active_class,sizeof(active_class),"%s",s);
}

/* Emits the inherited properties of the class whose block has just been opened.
 *
 * Always leaves extends mode. If the active class declared parents
 * (active_class_extends), every property recorded for each parent is printed
 * followed by ';' and also recorded under the active class, so that classes
 * extending this one later inherit them too. The parent list and the
 * active_class_extends flag are then reset for the next class.
 */
void output_properties(){
	extends_mode_on = false;
	if(!active_class_extends)
		return;

	// Snapshot the parents' properties first: appending to the active class
	// while walking a parent's vector would invalidate the iteration if a class
	// names itself as a parent.
	std::vector<std::string> inherited;
	for(const auto& parent: parent_classes){
		const auto known = classes.find(parent);
		if(known == classes.end())
			continue;	// unknown parent: nothing to inherit, and no empty entry is created
		inherited.insert(inherited.end(), known->second.begin(), known->second.end());
	}

	auto& own_properties = classes[active_class];
	for(const auto& property: inherited){
		std::cout<< property <<";";
		own_properties.push_back(property);
	}

	//reset
	active_class_extends = false;
	parent_classes.clear();
}
/* Returns a uniformly distributed random index in the inclusive range [0, limit].
 *
 * limit: largest index that may be returned (defaults to 0, which always yields 0).
 *
 * The generator is seeded once and reused across calls instead of constructing
 * a random_device and a Mersenne Twister on every call. The distribution works
 * in size_t so the bound is not narrowed before sampling. The return type is
 * plain int: a top-level const on a by-value return has no effect for callers.
 */
int random_index(size_t limit=0) {
        static std::mt19937 gen{std::random_device{}()};
        std::uniform_int_distribution<size_t> dis(0, limit);

        // Generate a random index
        return static_cast<int>(dis(gen));
}

/* Picks one element at random from a comma-separated list.
 *
 * input: the list text, e.g. "#03AED2,#FDDE55". Elements are split on ','
 *        exactly as written (no trimming).
 *
 * Returns a pointer to the chosen element, or to an empty string when input
 * contains no elements. The pointer refers to storage owned by this function
 * and stays valid only until the next call, so callers that keep the value
 * must copy it (the array rule does so with strdup).
 */
const char* select_random_from_values(const std::string& input) {
    // Holds the result after the local list is destroyed; returning c_str() of
    // an element of the local vector would hand back a dangling pointer.
    static std::string chosen;

    std::vector<std::string> candidates;
    std::stringstream stream(input);
    for (std::string item; std::getline(stream, item, ','); ) {
        candidates.push_back(item);
    }

    if (candidates.empty()) {
        // Guard the empty list: size() - 1 would wrap around to a huge bound.
        chosen.clear();
    } else {
        chosen = candidates[random_index(candidates.size() - 1)];
    }
    return chosen.c_str();
}

void yyerror(const char* s);
int yylex(void);

%}

/* Syntax errors are reported through the user-written yyreport_syntax_error() in the epilogue. */
%define parse.error custom

/* Every semantic value in this grammar is a C string. */
%union{
    char* string;
}

/* Tokens without a semantic value. */
%token OPENING_BRACES
%token CLOSING_BRACES
%token PROPERTY_SEPARATOR
%token PAIR_SEPARATOR
%token ASSIGN
%token EXTENDS
%token ALIAS
%token SCALAR
%token WHITESPACE
%token IMP_MARKER

/* Tokens whose matched text is passed along by the scanner (a strdup'd copy of yytext). */
%token <string> PROPERTY_NAME
%token <string> IDENTIFIER
%token <string> CLASS_NAME
%token <string> TEXT
%token <string> COMMENT

/* Nonterminals that produce a string value. */
%nterm <string> valid_value
%nterm <string> array

%%

/* Start symbol: a stylesheet is a sequence of whitespace runs, variable
   assignments, class definitions, comments and alias statements. The recursive
   alternatives are all right-recursive, and the sequence can only end on a
   class definition, a comment or an alias statement. */
css: opt_whitespace css 
	| variable css 
	| class_definition css
	| comment css  
	| aliasing css
	| class_definition
	| comment 
	| aliasing
	;

/* ".original alias .new_name;" -- emits a block for the second class name that
   contains the properties recorded for the first. The action registers the
   second name as the active class, then registers the first name as a parent
   in extends mode and lets output_properties() copy the parent's properties. */
aliasing: CLASS_NAME opt_whitespace ALIAS opt_whitespace CLASS_NAME opt_whitespace PROPERTY_SEPARATOR 
	{
		register_class($5);
		std::cout<<"{";
		extends_mode_on = true;
		register_class($1);
		active_class_extends = true;
		output_properties();
		std::cout<<"\n}";
	};

/* A class definition is either a full block (selectors, optional extends
   clause, braces and body) or a bare ".class;" statement, which is echoed on a
   new line followed by ';'. */
class_definition:  selectors opt_extends_class_names open_brace class_body close_brace opt_whitespace 
		|  CLASS_NAME opt_whitespace PROPERTY_SEPARATOR opt_whitespace {std::cout<<"\n"<<$1<<";";}
		;

/* The body of a block: any mix of whitespace, comments and property lists. */
class_body: opt_whitespace 
	| comment 
	| property_pairs 
	| class_body opt_whitespace 
	| class_body comment
	| class_body property_pairs 
	;
	;

/* A bracketed value list (the scanner reports square brackets as
   OPENING_BRACES / CLOSING_BRACES). Its value is a copy of one element picked
   by select_random_from_values(), which splits the list text on commas. */
array:	OPENING_BRACES valid_value CLOSING_BRACES {$$=strdup(select_random_from_values($2));};

/* Variable assignment, in three forms. Each stores the value text in
   `variables` and appends one flag to is_var:
     - "$name = value;"   flag true
     - "name = value;"    flag false
     - "name = [a,b,c];"  flag false, value is the element chosen by the array rule */
variable: SCALAR IDENTIFIER opt_whitespace ASSIGN opt_whitespace valid_value opt_whitespace PROPERTY_SEPARATOR {variables[$2]=std::string($6);set_is_var(true);}
        |        IDENTIFIER opt_whitespace ASSIGN opt_whitespace valid_value opt_whitespace PROPERTY_SEPARATOR {variables[$1]=std::string($5);set_is_var(false);}
        |        IDENTIFIER opt_whitespace ASSIGN opt_whitespace array opt_whitespace PROPERTY_SEPARATOR {variables[$1]=std::string($5);set_is_var(false);}
	;

/* Opening brace of a block: prints '{' and then any inherited properties. */
open_brace: 	OPENING_BRACES {
		  	printf("{");
			output_properties();
		};

/* Closing brace of a block: prints '}' on a new line. */
close_brace: 	CLOSING_BRACES {printf("\n}");};

/* A selector list: class names, or a bare identifier (such as an element name)
   optionally followed by more selectors. The rules are right-recursive and
   each action runs once its whole right-hand side has been reduced, so the
   parts of a compound selector are printed last-to-first, one per line. */
selectors: class_names
	 | IDENTIFIER opt_whitespace { printf("\n%s",$1);}
	 | IDENTIFIER opt_whitespace selectors { printf("\n%s",$1);}
	 ;

/* One or more class names, each handed to register_class(). */
class_names: CLASS_NAME 				{register_class($1);}
	   | CLASS_NAME opt_whitespace 			{register_class($1);}
	   | CLASS_NAME opt_whitespace class_names 	{register_class($1);}
	   ;

/* Optional "extends .a .b" clause listing the parent classes. */
opt_extends_class_names: EMPTY 
		        | extends opt_whitespace class_names 
			;

/* The extends keyword switches register_class() into parent-collecting mode. */
extends: EXTENDS {extends_mode_on = true;active_class_extends = true;}
 
/* One or more properties; the separator after the last one is optional. */
property_pairs: property_pair opt_whitespace property_separator opt_whitespace
	      | property_pair opt_whitespace 
	      | property_pair opt_whitespace property_separator opt_whitespace property_pairs opt_whitespace 
	      ;

/* A single "name : value" property, optionally prefixed with the important
   marker. The action prints the property on its own tab-indented line, appends
   " !important " when the marker was present, and records the property text
   under the active class. The recorded text does not include the !important
   suffix. */
property_pair: opt_important PROPERTY_NAME opt_whitespace PAIR_SEPARATOR opt_whitespace valid_value 
	     {
		std::string t = "\n\t"+std::string($2)+" : "+std::string($6);
		std::cout<< t;
		if(marked_important)
			printf(" !important ");
		classes[active_class].push_back(t);
	     };

/* valid_value: the text of a property or variable value, built left to right
   from TEXT, IDENTIFIER and WHITESPACE tokens.

   An identifier that names a known variable is replaced either by a
   "var(--name)" reference (when its is_var flag is set) or by the variable's
   stored value; any other identifier is kept as literal text. Each whitespace
   token contributes a single space.

   Every concatenation builds a std::string and returns a fresh strdup'd copy.
   The token buffers come from strdup(yytext) and are exactly as long as the
   token, so appending to them in place with strcpy/strcat would write past
   their end.

   NOTE(review): is_var is indexed by the entry's position inside `variables`
   (an unordered_map), while flags are appended in assignment order -- confirm
   the two orders agree. */
valid_value: TEXT 			{$$=$1;}
           | IDENTIFIER 		{
					auto f_iter = variables.find($1);
					if(f_iter == variables.end()){
						// Not a variable: keep the identifier text itself.
						$$ = $1;
					}else if(is_var[std::distance(variables.begin(),f_iter)]){
						const std::string reference = "var(--" + std::string($1) + ")";
						$$ = strdup(reference.c_str());
					}else{
						$$ = strdup(f_iter->second.c_str());
					}
					}
           | valid_value TEXT 		{
					const std::string joined = std::string($1) + $2;
					$$ = strdup(joined.c_str());
					}
           | valid_value IDENTIFIER 	{
					std::string joined($1);
					auto f_iter = variables.find($2);
					if(f_iter == variables.end()){
						// Not a variable: append the identifier text itself.
						joined += $2;
					}else if(is_var[std::distance(variables.begin(),f_iter)]){
						joined += "var(--" + std::string($2) + ")";
					}else{
						joined += f_iter->second;
					}
					$$ = strdup(joined.c_str());
					}
           | valid_value WHITESPACE 	{
					const std::string joined = std::string($1) + " ";
					$$ = strdup(joined.c_str());
					}
           ;

/* Optional important marker in front of a property; records its presence in marked_important. */
opt_important: EMPTY 	  {marked_important = false;}
	     | IMP_MARKER {marked_important = true;};
/* A property separator is echoed to the output as ';'. */
property_separator : PROPERTY_SEPARATOR {printf(";");};

/* Matches nothing; used to spell out optional parts of other rules. */
EMPTY: /*empty*/;
/* Zero or more WHITESPACE tokens (the scanner emits one token per whitespace character). */
opt_whitespace: EMPTY 
	      | WHITESPACE opt_whitespace 
	      ;
		
/* A COMMENT token is echoed to the output unchanged. */
comment: COMMENT {printf("%s",$1);}

%%

/* Emits the trailing ":root{...}" block once parsing has finished.
 *
 * Nothing is printed unless at least one is_var flag is set. Otherwise every
 * entry of `variables` whose flag is set is written as a CSS custom property,
 * "--name:value;", one per line.
 *
 * NOTE(review): the flag for an entry is looked up by the entry's position
 * while iterating `variables`, an unordered_map, whereas flags are appended to
 * is_var in assignment order -- confirm that the two orders agree.
 */
void finally(){
	//print :root block
	bool any_vars = false;
	for(const bool flag: is_var){
		if(flag){
			any_vars = true;
			break;
		}
	}
	if(!any_vars)
		return;

	std::cout<< "\n:root{";
	// std::size_t rather than the non-standard `uint` typedef, which not every platform provides.
	std::size_t i = 0;
	for(const auto& [k,v]: variables){
		// The bounds check keeps the lookup safe even if the two containers ever differ in size.
		if (i < is_var.size() && is_var[i])
			std::cout<< "\n\t--"<<k<<":"<<v<<";";
		++i;
	}
	std::cout<< "\n}";
}

/* Scanner end-of-input hook: always returns 1, i.e. there is no further input to switch to. */
int yywrap(void) {
	const int no_more_input = 1;
	return no_more_input;
}

/* Prints the parser's error message s to standard output as "Error: <s>",
   with a newline before and after. */
void yyerror(const char* s) {
    fprintf(stdout, "\nError: %s\n", s);
}

/* Custom syntax-error reporter, used because the grammar sets parse.error to custom.
 *
 * Writes one line to stderr: the error location, ": syntax error", up to
 * TOKENMAX expected token names ("expected A or B ..."), and the unexpected
 * lookahead token when there is one.
 *
 * ctx: parser context supplied by the generated parser.
 * Returns 0, or the negative value from yypcontext_expected_tokens() when that
 * call fails, so the failure is passed back to yyparse.
 */
static int yyreport_syntax_error (const yypcontext_t *ctx) { 
	int res = 0; 
	YY_LOCATION_PRINT (stderr, *yypcontext_location (ctx));
	fprintf (stderr, ": syntax error");
	 // Report the tokens expected at this point. 
	{ 
		enum { TOKENMAX = 5 };
		yysymbol_kind_t expected[TOKENMAX];
		int n = yypcontext_expected_tokens (ctx, expected, TOKENMAX);
		if (n < 0) // Forward errors to yyparse. 
			res = n; 
		else
			for (int i = 0; i < n; ++i) 
				fprintf (stderr, "%s %s", i == 0 ? ": expected" : " or", yysymbol_name (expected[i])); 
	} // Report the unexpected token. 
	{ 
		yysymbol_kind_t lookahead = yypcontext_token (ctx); 
		// Only the fprintf below is governed by this if; the location print
		// that follows it runs whether or not there is a lookahead token.
		if (lookahead != YYSYMBOL_YYEMPTY) 
		fprintf (stderr, " before %s at ", yysymbol_name (lookahead)); 
		YY_LOCATION_PRINT (stderr, *yypcontext_location (ctx));
	} 
		fprintf (stderr, "\n"); 
		return res;
}
/* Entry point: runs the parser, then emits the trailing :root block.
 *
 * The exit status is the value returned by yyparse() (0 on success, non-zero
 * when parsing failed), so scripts and build steps can detect a bad input
 * file. finally() still runs after a failed parse, so the output produced is
 * the same as before.
 */
int main() {
    const int parse_status = yyparse();
    finally();
    return parse_status;
}

Makefile

Below is the Makefile used to build the project. If you are building on Linux or WSL, just run the ‘make’ command. Otherwise, running the commands listed under the ‘generate’ and ‘compile’ targets by hand should work.

# Builds the pss transpiler from the flex scanner (pss.l) and the bison grammar (pss.y).

# None of these targets is named after a file it creates, so declare them phony:
# a stray file called "clean" or "all" must not stop the recipe from running.
.PHONY: all generate compile clean git-ignore-fix

all: generate compile

# Produces lex.yy.c from pss.l, and y.tab.c / y.tab.h from pss.y.
generate:
	flex pss.l && bison -t -dy pss.y

# Compiles and links the generated sources into the pss executable.
compile:
	g++ -std=c++17 lex.yy.c y.tab.c -o pss

# -f keeps clean from failing when some of the files are already gone.
clean:
	rm -f ./pss
	rm -f y.tab.c y.tab.h
	rm -f lex.yy.c

# Re-indexes the working tree so that files newly covered by .gitignore are dropped from tracking.
git-ignore-fix:
	git rm -r --cached .
	git add .
	git commit -m "Untracked files issue resolved to fix .gitignore"

Testing

Finally we can test our program by executing the command

Contents