flipcode - Versatile Text Scanner

Versatile Text Scanner
Submitted by

This is a text scanner that's stood me in good stead for many years. I've written zillions of parsers with it, and a couple of compilers. It's really minimal, and stateless. As a minimal stateless thing, it is simply a C style interface wrapped in a namespace. You can certainly wrap it in a class if you wish, which is how this code started out, but I've found it much more flexible as a C style API. It operates on a block of text loaded into memory. There are no files or iostreams involved. stdint.h is lifted out of the C99 specification. You can substitute any typedefs you wish; I used it because it was convenient. There are obviously lots of things not included - like substring matching - and some odd things included, like scanning past C++ style comments. Basically, the functions that got used a million zillion times over the years have stayed, and all the other stuff is gone. The interface is really easy to use. You pass in a pointer to where you are now in the text, and a pointer to the end of the text. The routine you call does its thing, and passes back a pointer to the next edible character. You can use that pointer as the new current pointer, or you can forget it and reuse your old current pointer to do something else. I've attached a simple Wavefront OBJ file reader to demonstrate how the TextScanner works. Use the code any way you wish; you assume all responsibility for its correctness and suitability should you use it. - nick

Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [TextScanner.h] - (3,435 bytes)


/*
 __  __           _           _       
|  \/  | ___  ___| |__  _   _| | __ _ 
| |\/| |/ _ \/ __| '_ \| | | | |/ _` |
| |  | |  __/\__ \ | | | |_| | | (_| |
|_|  |_|\___||___/_| |_|\__,_|_|\__,_|
 _____         _   ____                                  
|_   _|____  _| |_/ ___|  ___ __ _ _ __  _ __   ___ _ __ 
  | |/ _ \ \/ / __\___ \ / __/ _` | '_ \| '_ \ / _ \ '__|
  | |  __/>  <| |_ ___) | (_| (_| | | | | | | |  __/ |   
  |_|\___/_/\_\\__|____/ \___\__,_|_| |_|_| |_|\___|_|   
                                                         
This parser was written in the distant past by Nick Porcino for the public domain
and is freely available on an as is basis. It is meant for educational
purposes and is not suitable for any particular purpose. No warranty is expressed
or implied. Use at your own risk. Do not operate heavy machinery or governments
while using this code.

The types referenced are compatible with the C99 stdint.h types.
*/

#pragma once

#ifndef TEXTSCANNER_H
#define TEXTSCANNER_H

#include <stdint.h>

// Stateless text-scanning helpers that operate on a block of text held in
// memory.  Every routine receives the current position plus a bound on the
// text and hands back a position inside that text; the caller decides whether
// to adopt the returned pointer as its new current position.
namespace TextScanner
{
	// Get Token
	// Each routine skips leading white space, then reports the token through
	// resultStringBegin / stringLength (the token is not copied).
	//   GetToken                       - token runs up to the character delim
	//   GetTokenWSDelimited            - token runs up to the next white space
	//   GetTokenAlphaNumeric           - token made of letters, digits and '_'
	//   GetNameSpacedTokenAlphaNumeric - as above, ':' is accepted as well
	char const* GetToken						(char delim, char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);
	char const* GetTokenWSDelimited				(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);
	char const* GetTokenAlphaNumeric			(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);
	char const* GetNameSpacedTokenAlphaNumeric	(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);

	// Get Value
	//   GetString - text between the next pair of double quotes
	//   GetInt    - decimal integer (the signed overloads accept a leading + or -)
	//   GetHex    - hexadecimal digits, upper or lower case, no "0x" prefix handling
	//   GetFloat  - decimal number with optional fraction and e/E exponent
	char const* GetString	(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);
	char const* GetInt		(char const* pCurr, char const* pEnd, int16_t& result);
	char const* GetInt		(char const* pCurr, char const* pEnd, int32_t& result);
	char const* GetInt		(char const* pCurr, char const* pEnd, uint32_t& result);
	char const* GetHex		(char const* pCurr, char const* pEnd, uint32_t& result);
	char const* GetFloat	(char const* pCurr, char const* pEnd, float& result);

	// Scan
	// Forward scans walk from pCurr towards pEnd.  The backward scans walk from
	// pCurr down towards pStart, the first character of the text.
	char const* ScanForCharacter			(char delim, char const* pCurr, char const* pEnd);
	char const* ScanBackwardsForCharacter	(char delim, char const* pCurr, char const* pStart);
	char const* ScanForWhiteSpace			(char const* pCurr, char const* pEnd);
	char const* ScanBackwardsForWhiteSpace	(char const* pCurr, char const* pStart);
	char const* ScanForNonWhiteSpace		(char const* pCurr, char const* pEnd);
	char const* ScanForTrailingNonWhiteSpace(char const* pCurr, char const* pEnd);
	char const* ScanForQuote				(char const* pCurr, char const* pEnd);
	char const* ScanForEndOfLine			(char const* pCurr, char const* pEnd);
	char const* ScanForLastCharacterOnLine	(char const* pCurr, char const* pEnd);
	char const* ScanForBeginningOfNextLine	(char const* pCurr, char const* pEnd);
	char const* ScanPastCPPComments			(char const* pCurr, char const* pEnd);

	// Simple Tests
	// White space is tab (9), space, carriage return (13) and line feed (10).
	template <class Type> inline bool IsWhiteSpace(Type test)	{ return (test == 9 || test == ' ' || test == 13 || test == 10);	}
	// True for the ASCII digits '0'..'9'.
	template <class Type> inline bool IsNumeric(Type test)		{ return (test >= '0' && test <= '9');	}
	// True for the ASCII letters 'a'..'z' and 'A'..'Z'.
	template <class Type> inline bool IsAlpha(Type test)		{ return ((test >= 'a' && test <= 'z') || (test >= 'A' && test <= 'Z'));	}
}

#endif

Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [ModelReader.h] - (524 bytes)


#pragma once

#ifndef MODELREADER_H
#define MODELREADER_H

#include <vector>
using std::vector;

// Plain three-component float vector used for positions, normals and
// texture coordinates.
struct sdcVector 
{
	// Default construction yields the zero vector so that no component is
	// ever read while uninitialized.
	sdcVector() : x(0.0f), y(0.0f), z(0.0f) { }

	// Construct from explicit components.
	sdcVector(float _x, float _y, float _z) : x(_x), y(_y), z(_z) { }

	float x;
	float y;
	float z;
};

// Geometry gathered from a Wavefront OBJ text buffer.
class Model
{
public:
	// Parses the dataSize bytes of OBJ text starting at pSource and appends
	// what it finds to the containers below.
	void ReadWavefrontOBJ(char* pSource, int dataSize);

	// One entry per 'v' record.
	vector <sdcVector>	vertices;
	// One entry per 'vn' record.
	vector <sdcVector>	normals;
	// One entry per 'vt' record.
	vector <sdcVector>	mapping;
	// Three entries per triangle; the reader subtracts 1 from each index it
	// reads from the file before storing it.
	vector <int>		faceIndices;
	// Parallel to faceIndices; the reader stores -1 where a face corner has
	// an empty normal slot.
	vector <int>		normalIndices;
	// Parallel to faceIndices; the reader stores -1 where a face corner has
	// an empty texture-coordinate slot.
	vector <int>		uvIndices;
};

#endif

Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [stdint.h] - (132 bytes)


#pragma once


// Minimal stand-ins for the C99 <stdint.h> fixed-width integer names used by
// TextScanner.
// NOTE(review): these presume int is 32 bits and short is 16 bits on the
// target compiler - confirm before porting.
typedef unsigned int uint32_t;
typedef int int32_t;
typedef unsigned short uint16_t;
typedef short int16_t;

Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [TextScanner.cpp] - (8,605 bytes)


/*
This parser was written in the distant past by Nick Porcino for the public domain
and is freely available on an as is basis. It is meant for educational
purposes and is not suitable for any particular purpose. No warranty is expressed
or implied. Use at your own risk. Do not operate heavy machinery or governments
while using this code.
*/

#include <cmath>
#include "TextScanner.h"

//! @todo replace Assert with proper error reporting mechanism

#include <assert.h>
#define Assert assert

/*----------------------------------------------------------------------------

*/

// Returns a pointer to the first double quote in [pCurr, pEnd), or pEnd when
// the range contains none.
char const* TextScanner::ScanForQuote(char const* pCurr, char const* pEnd)
{
	Assert(pCurr && pEnd && pEnd >= pCurr);

	char const* pScan = pCurr;
	while (pScan < pEnd && *pScan != '\"')
		++pScan;

	return pScan;
}

// Returns a pointer to the first white-space character in [pCurr, pEnd), or
// pEnd when the range contains none.
char const* TextScanner::ScanForWhiteSpace(char const* pCurr, char const* pEnd)
{
	Assert(pCurr && pEnd && pEnd >= pCurr);

	char const* pScan = pCurr;
	while (pScan < pEnd && !TextScanner::IsWhiteSpace(*pScan))
		++pScan;

	return pScan;
}

// Skips white space: returns a pointer to the first character in
// [pCurr, pEnd) that is not white space, or pEnd when only white space remains.
char const* TextScanner::ScanForNonWhiteSpace(char const* pCurr, char const* pEnd)
{
	Assert(pCurr && pEnd && pEnd >= pCurr);

	char const* pScan = pCurr;
	while (pScan < pEnd && TextScanner::IsWhiteSpace(*pScan))
		++pScan;

	return pScan;
}

// Walks backwards from pCurr (inclusive) towards pStart (inclusive) and
// returns a pointer to the first white-space character met.  When none is
// found the returned pointer is one position before pStart.
char const* TextScanner::ScanBackwardsForWhiteSpace(char const* pCurr, char const* pStart)
{
	Assert(pCurr && pStart && pStart <= pCurr);

	char const* pScan = pCurr;
	while (pScan >= pStart && !TextScanner::IsWhiteSpace(*pScan))
		--pScan;

	return pScan;
}

// Steps pEnd backwards over white space and returns the resulting pointer:
// either the last non-white-space character at or before pEnd, or pCurr when
// everything after pCurr is white space.
// Note: the character at pEnd itself is read, so pEnd must point at a
// readable character rather than one past the end of the buffer.
char const* TextScanner::ScanForTrailingNonWhiteSpace(char const* pCurr, char const* pEnd)
{
	Assert(pCurr && pEnd && pEnd >= pCurr);

	char const* pTail = pEnd;
	while (pCurr < pTail && TextScanner::IsWhiteSpace(*pTail))
		--pTail;

	return pTail;
}

// Returns a pointer to the first occurrence of delim in [pCurr, pEnd), or
// pEnd when delim does not occur.
char const* TextScanner::ScanForCharacter(char delim, char const* pCurr, char const* pEnd)
{
	Assert(pCurr && pEnd);

	char const* pScan = pCurr;
	while (pScan < pEnd && *pScan != delim)
		++pScan;

	return pScan;
}

// Walks backwards from pCurr (inclusive) towards pStart (inclusive) and
// returns a pointer to the first occurrence of delim met.  When delim is not
// found the returned pointer is one position before pStart.
char const* TextScanner::ScanBackwardsForCharacter(char delim, char const* pCurr, char const* pStart)
{
	Assert(pCurr && pStart && pStart <= pCurr);

	char const* pScan = pCurr;
	while (pScan >= pStart && *pScan != delim)
		--pScan;

	return pScan;
}

// Extracts the token that starts at the first non-white-space character at or
// after pCurr and runs up to (not including) the next occurrence of delim, or
// up to pEnd when delim does not occur.
//
//   resultStringBegin - receives the start of the token (no copy is made)
//   stringLength      - receives the number of characters in the token
//
// Returns the position just past the token: a pointer to the delimiter, which
// is not consumed, or pEnd.  This matches GetTokenWSDelimited, so the result
// can be used as the caller's new current position.
char const* TextScanner::GetToken(char delim, char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
	Assert(pCurr && pEnd);

	resultStringBegin = ScanForNonWhiteSpace(pCurr, pEnd);

	char const* pStringEnd = ScanForCharacter(delim, resultStringBegin, pEnd);

	stringLength = (uint32_t)(pStringEnd - resultStringBegin);

	// Hand back the end of the token; returning its start would leave a
	// caller that adopts the result stuck on the same token forever.
	return pStringEnd;
}

// Skips leading white space, then collects the longest run of letters, digits
// and underscores.
//
//   resultStringBegin - receives the start of the run (no copy is made)
//   stringLength      - receives the length of the run, 0 when there is none
//
// Returns a pointer to the first character that is not part of the run, or
// pEnd.
char const* TextScanner::GetTokenAlphaNumeric(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
	Assert(pCurr && pEnd);

	char const* const pTokenStart = ScanForNonWhiteSpace(pCurr, pEnd);
	char const* pScan = pTokenStart;

	for (; pScan < pEnd; ++pScan)
	{
		char const c = *pScan;

		// White space is none of the accepted characters, so it ends the
		// token through this same test.
		bool const isTokenChar = (c == '_') || TextScanner::IsNumeric(c) || TextScanner::IsAlpha(c);
		if (!isTokenChar)
			break;
	}

	resultStringBegin = pTokenStart;
	stringLength = (uint32_t)(pScan - pTokenStart);
	return pScan;
}

// same as above except token may also contain colons.

// Skips leading white space, then collects the longest run of letters,
// digits, underscores and colons.
//
//   resultStringBegin - receives the start of the run (no copy is made)
//   stringLength      - receives the length of the run, 0 when there is none
//
// Returns a pointer to the first character that is not part of the run, or
// pEnd.
char const* TextScanner::GetNameSpacedTokenAlphaNumeric	(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
	Assert(pCurr && pEnd);

	char const* const pTokenStart = ScanForNonWhiteSpace(pCurr, pEnd);
	char const* pScan = pTokenStart;

	for (; pScan < pEnd; ++pScan)
	{
		char const c = *pScan;

		// White space is none of the accepted characters, so it ends the
		// token through this same test.
		bool const isTokenChar = (c == ':') || (c == '_') || TextScanner::IsNumeric(c) || TextScanner::IsAlpha(c);
		if (!isTokenChar)
			break;
	}

	resultStringBegin = pTokenStart;
	stringLength = (uint32_t)(pScan - pTokenStart);
	return pScan;
}

// Skips leading white space, then reports the run of characters up to the
// next white space (or pEnd).
//
//   resultStringBegin - receives the start of the token (no copy is made)
//   stringLength      - receives the number of characters in the token
//
// Returns a pointer to the white space that ended the token, or pEnd.
char const* TextScanner::GetTokenWSDelimited(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
	Assert(pCurr && pEnd);

	char const* const pTokenStart = ScanForNonWhiteSpace(pCurr, pEnd);
	char const* const pTokenEnd   = ScanForWhiteSpace(pTokenStart, pEnd);

	resultStringBegin = pTokenStart;
	stringLength = (uint32_t)(pTokenEnd - pTokenStart);

	return pTokenEnd;
}

// Extracts the text enclosed by the next pair of double quotes in
// [pCurr, pEnd).
//
//   resultStringBegin - receives the first character after the opening quote;
//                       set to pEnd when no opening quote exists
//   stringLength      - receives the number of characters between the quotes.
//                       When the closing quote is missing, the string is taken
//                       to run to pEnd.  0 when no opening quote exists.
//
// Returns the position just past the closing quote, or pEnd when the text ran
// out first.  The result never exceeds pEnd.
char const* TextScanner::GetString(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
	Assert(pCurr && pEnd && pEnd >= pCurr);

	stringLength = 0;

	pCurr = ScanForQuote(pCurr, pEnd);
	if (pCurr >= pEnd)
	{
		// No opening quote: report an empty string located at the end of text
		// so the caller never sees an unset begin pointer.
		resultStringBegin = pEnd;
		return pEnd;
	}

	++pCurr;	// skip past opening quote
	resultStringBegin = pCurr;

	pCurr = ScanForQuote(pCurr, pEnd);
	stringLength = (uint32_t)(pCurr - resultStringBegin);

	// Step over the closing quote only when one was actually found; otherwise
	// pCurr already equals pEnd and must not move beyond it.
	if (pCurr < pEnd)
		++pCurr;

	return pCurr;
}


// Finds the line break that ends the current line.
//
// Returns a pointer to the LAST character of the line break: for a lone '\r'
// or '\n' that character itself, for a "\r\n" or "\n\r" pair the second
// character of the pair.  Returns pEnd when the range holds no line break.
// The character after the break character is inspected only when it lies
// inside [pCurr, pEnd).
char const* TextScanner::ScanForEndOfLine(char const* pCurr, char const* pEnd)
{
	while (pCurr < pEnd)
	{
		char const c = *pCurr;

		if (c == '\r' || c == '\n')
		{
			// A two-character break is the opposite character following
			// immediately.
			char const partner = (c == '\r') ? '\n' : '\r';

			if (pEnd - pCurr >= 2 && pCurr[1] == partner)
				++pCurr;
			break;
		}

		++pCurr;
	}
	return pCurr;
}

// Returns a pointer to the last character of the current line: the first
// position at or after pCurr whose successor is '\r', '\n', '\0' or the end
// of the text.  The character at pCurr itself is not examined.  Returns pCurr
// unchanged when pCurr is already at pEnd.
char const* TextScanner::ScanForLastCharacterOnLine(char const* pCurr, char const* pEnd)
{
	while (pCurr < pEnd)
	{
		// The final character of the text is by definition the last one on
		// its line; do not look beyond pEnd to decide.
		if (pEnd - pCurr < 2)
			break;

		char const next = pCurr[1];
		if (next == '\r' || next == '\n' || next == '\0')
		{
			break;
		}

		++pCurr;
	}
	return pCurr;
}

// 16-bit convenience overload: parses through the 32-bit overload and
// truncates the value to int16_t.  Returns whatever position the 32-bit
// overload returns.
char const* TextScanner::GetInt(char const* pCurr, char const* pEnd, int16_t& result)
{
	int32_t wide = 0;
	char const* const pNext = GetInt(pCurr, pEnd, wide);
	result = static_cast<int16_t>(wide);
	return pNext;
}

// Parses a signed decimal integer: leading white space is skipped, then an
// optional '+' or '-' is accepted, then digits are consumed.
//
//   result - receives the parsed value; 0 when no digits are present
//
// Returns a pointer to the first character that was not consumed (pEnd at
// most).  No character at or beyond pEnd is read.
char const* TextScanner::GetInt(char const* pCurr, char const* pEnd, int32_t& result)
{
	pCurr = ScanForNonWhiteSpace(pCurr, pEnd);

	result = 0;

	bool signFlip = false;

	// Only look for a sign when there is a character left to look at.
	if (pCurr < pEnd)
	{
		if (*pCurr == '+')
		{
			++pCurr;
		}
		else if (*pCurr == '-')
		{
			++pCurr;
			signFlip = true;
		}
	}

	// Accumulate the magnitude as unsigned so that long digit runs wrap
	// instead of overflowing a signed value, and so that the most negative
	// 32-bit value can be represented before the sign is applied.
	uint32_t magnitude = 0;
	while (pCurr < pEnd && TextScanner::IsNumeric(*pCurr))
	{
		magnitude = magnitude * 10 + (uint32_t)(*pCurr - '0');
		++pCurr;
	}

	if (signFlip)
	{
		magnitude = 0u - magnitude;
	}
	result = (int32_t) magnitude;
	return pCurr;
}

// Parses an unsigned decimal integer: leading white space is skipped, then
// digits are consumed.  No sign is accepted.
//
//   result - receives the parsed value; 0 when no digits are present
//
// Returns a pointer to the first character that was not consumed.
char const* TextScanner::GetInt(char const* pCurr, char const* pEnd, uint32_t& result)
{
	uint32_t value = 0;

	for (pCurr = ScanForNonWhiteSpace(pCurr, pEnd);
		 pCurr < pEnd && TextScanner::IsNumeric(*pCurr);
		 ++pCurr)
	{
		value = value * 10 + *pCurr - '0';
	}

	result = value;
	return pCurr;
}

// Parses a decimal floating-point number of the form
//     [white space] [+|-] digits [. digits] [e|E integer]
//
//   result - receives the parsed value; 0 when no number is present
//
// Returns a pointer to the first character that was not consumed (pEnd at
// most).  No character at or beyond pEnd is read.
//
// The integer part and the exponent are read with the int32_t GetInt
// overload, so they share its range.
char const* TextScanner::GetFloat(char const* pCurr, char const* pEnd, float& result)
{
	pCurr = ScanForNonWhiteSpace(pCurr, pEnd);

	result = float(0.0);

	bool signFlip = false;

	// The sign is handled here rather than by GetInt so that values such as
	// "-0.5", whose integer part is zero, still come out negative.
	if (pCurr < pEnd)
	{
		if (*pCurr == '+')
		{
			++pCurr;
		}
		else if (*pCurr == '-')
		{
			++pCurr;
			signFlip = true;
		}
	}

	// get integer part
	int32_t intPart = 0;
	pCurr = GetInt(pCurr, pEnd, intPart);
	result = (float) intPart;

	// get fractional part
	if (pCurr < pEnd && *pCurr == '.')
	{
		++pCurr;

		float scaler = 0.1f;
		while (pCurr < pEnd && TextScanner::IsNumeric(*pCurr))
		{
			result = result + (float(*pCurr - '0') * scaler);
			++pCurr;
			scaler *= 0.1f;
		}
	}

	// get exponent
	if (pCurr < pEnd && (*pCurr == 'e' || *pCurr == 'E'))
	{
		++pCurr;

		pCurr = GetInt(pCurr, pEnd, intPart);
		result = (float)(result * std::pow(10.0, (double) intPart));
	}

	if (signFlip)
	{
		result = - result;
	}
	return pCurr;
}


// Parses an unsigned hexadecimal integer: leading white space is skipped,
// then the digits 0-9, A-F and a-f are consumed.  A "0x" prefix is not
// recognised.
//
//   result - receives the parsed value; 0 when no hex digits are present
//
// Returns a pointer to the first character that was not consumed.
char const* TextScanner::GetHex(char const* pCurr, char const* pEnd, uint32_t& result)
{
	uint32_t value = 0;

	for (pCurr = ScanForNonWhiteSpace(pCurr, pEnd); pCurr < pEnd; ++pCurr)
	{
		char const c = *pCurr;
		uint32_t digit;

		if (TextScanner::IsNumeric(c))
			digit = (uint32_t)(c - '0');
		else if (c >= 'A' && c <= 'F')
			digit = (uint32_t)(c - 'A' + 10);
		else if (c >= 'a' && c <= 'f')
			digit = (uint32_t)(c - 'a' + 10);
		else
			break;

		value = value * 16 + digit;
	}

	result = value;
	return pCurr;
}

// Moves to the end of the current line and then past any white space that
// follows, returning the first non-white-space character found (or pEnd).
// Blank lines are skipped as a consequence.
char const* TextScanner::ScanForBeginningOfNextLine(char const* pCurr, char const* pEnd)
{
	char const* const pLineBreak = ScanForEndOfLine(pCurr, pEnd);
	char const* const pNextText  = ScanForNonWhiteSpace(pLineBreak, pEnd);
	return pNextText;
}


// Skips one C++ comment when pCurr points at the start of one.
//
//   "//" comment - returns the result of ScanForEndOfLine for the line
//   "/*" comment - returns the position just past the closing "*/", or pEnd
//                  when the comment is not terminated
//   anything else - returns pCurr unchanged
//
// No character at or beyond pEnd is read.
char const* TextScanner::ScanPastCPPComments(char const* pCurr, char const* pEnd)
{
	// A comment opener is two characters long; with fewer than two characters
	// left there is nothing to skip.
	if (pEnd - pCurr < 2 || pCurr[0] != '/')
	{
		return pCurr;
	}

	if (pCurr[1] == '/')
	{
		return ScanForEndOfLine(pCurr, pEnd);
	}

	if (pCurr[1] == '*')
	{
		pCurr += 2;
		while (pCurr < pEnd)
		{
			// Only test for the closer while both of its characters lie
			// inside the text.
			if (pEnd - pCurr >= 2 && pCurr[0] == '*' && pCurr[1] == '/')
			{
				return pCurr + 2;
			}

			++pCurr;
		}
	}

	return pCurr;
}

Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [ModelReader.cpp] - (3,508 bytes)


// standard libraries
#include <vector>

using std::vector;

#include "TextScanner.h"

// model reader package
#include "ModelReader.h"


void Model::ReadWavefrontOBJ(char* pSource, int dataSize)
{
	char const* pCurr = pSource;
	char const* pEnd  = pSource + dataSize;

	while (pCurr < pEnd)
	{
		pCurr = TextScanner::ScanForNonWhiteSpace(pCurr, pEnd);

		switch (*pCurr)
		{
			case 'u':		// u is the first character of the usemtl command
			case 'g':		// group
			case '#':		// comment - skip over 'em
			default:		// unknown tokens - skip over 'em
				pCurr = TextScanner::ScanForEndOfLine(pCurr, pEnd);
				pCurr = TextScanner::ScanForNonWhiteSpace(pCurr, pEnd);
				break;

			case 'v':		// vertex
				{
					sdcVector temp(0, 0, 0);
					++pCurr;	// skip past v
					switch (*pCurr)
					{
						case 'n':	// vertex normal
							++pCurr;	// skip n
							pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.x);
							pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.y);
							pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.z);
							normals.push_back(temp);
							break;
						
						case 't':	// vertex texture
							++pCurr;	// skip t
							pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.x);
							pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.y);
							pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.z);
							mapping.push_back(temp);
							break;
						
						default:	// vertex position
							pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.x);
							pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.y);
							pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.z);
							vertices.push_back(temp);
							break;
					}
				}
				break;

			case 'f':	// face
				{
					++pCurr;	// skip past f
					char const* eol = TextScanner::ScanForEndOfLine(pCurr, pEnd);
					int v[3];
					int t[3];
					int n[3];
					int index = 0;
					while (pCurr < eol)
					{
						// get index
						pCurr = TextScanner::GetInt(pCurr, eol, v[index]);
						v[index] -= 1;
						
						++pCurr;
						if (*pCurr == '/')	// no tex coord?
						{
							t[index] = -1;
						}
						else
						{
							pCurr = TextScanner::GetInt(pCurr, eol, t[index]);
							t[index] -= 1;
						}
						++pCurr;
						if (*pCurr == '/')	// no normal?
						{
							n[index] = -1;
						}
						else
						{
							pCurr = TextScanner::GetInt(pCurr, eol, n[index]);
							n[index] -= 1;
						}
						
						++index;
						if (index == 3)
						{
							faceIndices.push_back(v[0]);
							faceIndices.push_back(v[1]);
							faceIndices.push_back(v[2]);

							normalIndices.push_back(n[0]);
							normalIndices.push_back(n[1]);
							normalIndices.push_back(n[2]);
							
							uvIndices.push_back(t[0]);
							uvIndices.push_back(t[1]);
							uvIndices.push_back(t[2]);

							// if a polygon was encountered, turn it into triangles
							// by creating a fan
							v[1] = v[2];
							n[1] = n[2];
							t[1] = t[2];
							index = 2;
						}
					}
				}
				break;
				
			case 'o':	// indicates an object name. skip it!
			case 'm':	// material library m is first character of mtllib
				{
					pCurr = TextScanner::ScanForEndOfLine(pCurr, pEnd);	// skip it for the moment
				}
				break;

			// bmf extension t
			case 't':	// beginning a triangle strip
				break;
				
			// bmf extension q
			case 'q':	// continuing a triangle strip
				break;
		}
	}
}

The zip file viewer built into the Developer Toolbox made use of the zlib library, as well as the zlibdll source additions.