Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <fstream>
- #include <regex>
- #include <vector>
- #include <string>
- #include <iomanip>
- using namespace std;
/* Merges vData[ctr] with vData[ctr + 1] when together they spell a
 * two-character operator such as "+=" or "->". */
std::smatch bin_ops_merger(std::vector<std::string>&, std::size_t);
/* Zips two vectors element-wise into a vector of pairs. */
template<typename T, typename U>
std::vector<std::pair<T, U>> merge_vectors(const std::vector<T>&, const std::vector<U>&);
/* Classifies every lexeme and appends (lexeme, token name) pairs to the
 * second vector. */
void tokenizer(std::vector<std::string>&, std::vector<std::pair<std::string, std::string>>&);

/* Replaces every single-line and multi-line comment in `source` with one space.
 *
 * NOTE: this runs before string literals are recognised, so a "//" or a
 * comment opener inside a string literal is treated as a comment start. */
static std::string strip_comments(const std::string& source) {
    static const std::regex rexComments("(//.*)|(/\\*(?:.|[\\n\\r])*?\\*/)");
    return std::regex_replace(source, rexComments, " ");
}

/* Splits comment-free source text into raw lexemes.
 *
 * Separator pattern, tried in this order at every position:
 *   string literal : "..."  (backslash escapes allowed, confined to one line)
 *   decimal number : digits '.' digits
 *   any single non-alphanumeric character
 * The string alternative has to come first; otherwise the bare quote is
 * consumed by \W and the literal is never seen as a unit.
 *
 * Text between separators (identifiers, keywords, integers) becomes a lexeme
 * of its own, including the text after the last separator. Separators that
 * are a single whitespace character are discarded. */
static std::vector<std::string> split_lexemes(const std::string& source) {
    static const std::regex rexPtrn(R"re("(?:\\.|[^"\\\n])*"|\d+\.\d+|\W)re");
    static const std::string whitespace = " \t\n\r\v\f";

    std::vector<std::string> lexemes;
    std::string::const_iterator tail = source.cbegin(); // end of the previous separator
    const std::sregex_iterator none;
    for (std::sregex_iterator it(source.cbegin(), source.cend(), rexPtrn); it != none; ++it) {
        const std::ssub_match& sym = (*it)[0];
        if (tail != sym.first) // alphanumeric run preceding this separator
            lexemes.emplace_back(tail, sym.first);
        const bool blank = sym.length() == 1
                        && whitespace.find(*sym.first) != std::string::npos;
        if (!blank)
            lexemes.push_back(sym.str());
        tail = sym.second;
    }
    if (tail != source.cend()) // trailing run after the last separator
        lexemes.emplace_back(tail, source.cend());
    return lexemes;
}

/* Tokenizes a source file and writes a "lexeme || token" table.
 *
 * Usage: prog [input-file [output-file]]
 * Defaults: input "analyzethis.file", output "tokenized.txt".
 * Returns 0 on success, 1 when either file cannot be opened. */
int main(int argc, char** argv) {
    const std::string inPath  = (argc > 1) ? argv[1] : "analyzethis.file";
    const std::string outPath = (argc > 2) ? argv[2] : "tokenized.txt";

    //store full source file in a string
    std::ifstream ifs(inPath);
    if (!ifs) {
        std::cerr << "error: cannot open input file '" << inPath << "'\n";
        return 1;
    }
    std::string file, line;
    while (std::getline(ifs, line, '\0')) { // '\0' delimiter: reads across newlines
        file += line;
    }
    ifs.close();

    /*Strip single and multi line comments, then split into raw lexemes*/
    std::vector<std::string> vData = split_lexemes(strip_comments(file));

    //FIND BINARY OPERATORS AND COMBINE THEM
    //(ctr + 1 < size() instead of ctr < size() - 1: no unsigned wrap-around
    // when vData is empty; size() is re-read because merging shrinks vData)
    for (std::size_t ctr = 0; ctr + 1 < vData.size(); ++ctr) {
        bin_ops_merger(vData, ctr);
    }

    /*Holds final matches*/
    std::vector<std::pair<std::string, std::string>> vTokenLexeme;
    tokenizer(vData, vTokenLexeme);

    //Save tokenized lexemes
    std::ofstream ofs(outPath);
    if (!ofs) {
        std::cerr << "error: cannot open output file '" << outPath << "'\n";
        return 1;
    }
    ofs << std::string(30, '=') << '\n';
    ofs << std::setw(17) << std::right << "Lexeme ||" << std::setw(10) << "Token" << '\n';
    ofs << std::string(30, '=') << '\n';
    for (const auto& entry : vTokenLexeme) {
        ofs << std::setw(15) << std::left << entry.first << "|| " << entry.second << '\n';
    }
    ofs.close();
    return 0;
}
/* Zips two vectors element-wise into a vector of pairs.
 *
 * v1 supplies the `first` members and determines the length of the result;
 * v2 supplies the `second` members. Surplus elements of v2 are ignored.
 * Throws std::out_of_range (from vector::at) when v2 is shorter than v1. */
template<typename T, typename U>
std::vector<std::pair<T, U>> merge_vectors(const std::vector<T>& v1, const std::vector<U>& v2) {
    // The element type follows the template parameters, so the function works
    // for any copyable T and U, not only for strings.
    std::vector<std::pair<T, U>> vOut;
    vOut.reserve(v1.size());
    for (std::size_t i = 0; i < v1.size(); ++i) {
        vOut.emplace_back(v1[i], v2.at(i));
    }
    return vOut;
}
/* Detects and merges a two-character operator split across adjacent elements.
 *
 * When vData[ctr] followed by vData[ctr + 1] spells exactly one of the
 * recognised operators (+= -= *= /= %= &= != == |= ^= <= >= -- ++ << >> && || ->),
 * the concatenation is stored in vData[ctr] and vData[ctr + 1] is erased.
 *
 * Returns the match against the merged element, or an empty match_results
 * (empty() == true) when nothing was merged or ctr has no right-hand
 * neighbour. A non-empty result refers to vData[ctr] and stays valid only
 * until vData is modified again. */
std::smatch bin_ops_merger(std::vector<std::string>& vData, std::size_t ctr) {
    // Compiled once; building a std::regex on every call is costly.
    static const std::regex binaryOperatorsPattern("\\+=|-=|\\*=|/=|%=|&=|\\!=|==|\\|=|\\^="
                                                   "|<=|>=|--|\\+\\+|<<|>>|&&|\\|\\||->");
    std::smatch binOpsMatch; //stores the matched portion

    // A pair needs both ctr and ctr + 1 to be valid indices.
    if (vData.size() < 2 || ctr >= vData.size() - 1) {
        return binOpsMatch;
    }

    const std::string merged = vData[ctr] + vData[ctr + 1];
    if (!std::regex_match(merged, binaryOperatorsPattern)) {
        return binOpsMatch;
    }

    vData[ctr] = merged;                      //merge operators
    vData.erase(vData.begin() + ctr + 1);     //delete extra element
    // Match against the stored element, not the local copy, so the
    // iterators inside the returned smatch do not dangle after returning.
    std::regex_match(vData[ctr], binOpsMatch, binaryOperatorsPattern);
    return binOpsMatch;
}
/* Returns true when `word` is an element of `list`. */
static bool contains_word(const std::vector<std::string>& list, const std::string& word) {
    for (const std::string& item : list) {
        if (item == word) return true;
    }
    return false;
}

/* Returns the token name paired with `lexeme` in `table`, or nullptr when
 * the lexeme is not listed. */
static const std::string* find_symbol(
        const std::vector<std::pair<std::string, std::string>>& table,
        const std::string& lexeme) {
    for (const auto& entry : table) {
        if (entry.first == lexeme) return &entry.second;
    }
    return nullptr;
}

/* Returns the token name for one lexeme.
 *
 * Checks run in this fixed order and the first hit wins:
 * string literal, number, keyword, data type, binary operator, unary symbol,
 * library object; anything else is an IDENTIFIER. */
static std::string classify_lexeme(const std::string& lexeme) {
    // All patterns and tables are built once, not once per lexeme.
    static const std::regex rexStr("(\".*\")");
    // One or more digits with an optional fractional part. The dot is
    // escaped so that e.g. "1+2" is not taken for a number, and \d+ lets
    // integers of any length match.
    static const std::regex rexNum("\\d+(\\.\\d+)?");

    //Reserved Key Words
    static const std::vector<std::string> vKword = {
        "string","include","auto","const","struct","unsigned","break",
        "continue","else","for","signed","switch","void","case","default",
        "enum","goto","register","sizeof","typedef","volatile","char","do",
        "extern","if","return","static","union","while","asm","dynamic_cast",
        "namespace","reinterpret_cast","try","bool","explicit","new","template",
        "static_cast","typeid","catch","false","operator","typename","public",
        "class","friend","private","this","using","const_cast","inline","throw",
        "virtual","delete","mutable","protected","true","elseif"};
    // NOTE(review): "string" is also listed as a keyword above and keywords
    // are checked first, so "string" is never reported as PRIM_DTYPE.
    static const std::vector<std::string> vDataTypes = {
        "double","float","int","short","size_t","long","string"};
    //Binary Operators
    // NOTE(review): ":" is listed here as SCOPE_RES and shadows the COND_SEP
    // entry of the unary table below, which is therefore never reached.
    static const std::vector<std::pair<std::string, std::string>> vBOpsTokens = {
        {"+=","ADD_ASSIGN"},{"-=","SUB_ASSIGN"},{"*=","MUL_ASSIGN"},
        {"/=","DIV_ASSIGN"},{"%=","MOD_ASSIGN"},{"&=","AND_ASSIGN"},
        {"!=","LOGIC_INEQ"},{"==","LOGIC_EQ"},{"|=","OR_ASSIGN"},
        {"^=","XOR_ASSIGN"},{"<=","LESS_OR_EQ"},{">=","MORE_OR_EQ"},
        {"--","DECREMENT"},{"++","INCREMENT"},{"<<","INSERTION"},
        {">>","EXTRACTION"},{"&&","LOGIC_AND"},{"||","LOGIC_OR"},
        {"->","MEMBER_PTR"},{":","SCOPE_RES"}};
    //Unary Symbols
    static const std::vector<std::pair<std::string, std::string>> vUnaryTokens = {
        {".","MEMBER_OBJ"},{"#","PREPROC"},{",","SEPARATOR"},{"=","ASSIGN"},
        {"-","SUB"},{"+","ADD"},{"/","DIV"},{"*","MUL_OR_DEREF"},{"%","MOD"},
        {"(","L_PAREN"},{")","R_PAREN"},{"{","L_BRACE"},{"}","R_BRACE"},
        {"[","L_BRACKET"},{"]","R_BRACKET"},{"~","COMPLEMENT"},{"^","XOR"},
        {"|","OR"},{"&","AND"},{"?","CONDITIONAL"},{":","COND_SEP"},
        {";","SEMI_COLON"},{"!","NOT"},{">","GREATER_THAN"},{"<","LESS_THAN"}};
    //Library Objects
    // NOTE(review): "sizeof" is also a keyword above, so it is never
    // reported as LIB_OBJ.
    static const std::vector<std::string> vLibObj = {
        "cout","cin","printf","size","sizeof","system","getline","endl",
        "to_string"};

    if (std::regex_match(lexeme, rexStr)) return "STRING_LIT";
    if (std::regex_match(lexeme, rexNum)) return "NUMERIC";
    if (contains_word(vKword, lexeme)) return "KEYWORD";
    if (contains_word(vDataTypes, lexeme)) return "PRIM_DTYPE";
    if (const std::string* name = find_symbol(vBOpsTokens, lexeme)) return *name;
    if (const std::string* name = find_symbol(vUnaryTokens, lexeme)) return *name;
    if (contains_word(vLibObj, lexeme)) return "LIB_OBJ";
    //left overs are identifiers
    return "IDENTIFIER";
}

/*Assigns Matching Tokens to Lexemes
 *
 * vLex    : lexemes to classify, in source order (not modified).
 * vTknLex : receives one (lexeme, token name) pair per element of vLex,
 *           appended after whatever it already contains. */
void tokenizer(std::vector<std::string>& vLex,
               std::vector<std::pair<std::string, std::string>>& vTknLex) {
    vTknLex.reserve(vTknLex.size() + vLex.size());
    for (const std::string& lexeme : vLex) {
        vTknLex.emplace_back(lexeme, classify_lexeme(lexeme));
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement