|
Versatile Text Scanner
Submitted by |
This is a text scanner that's stood me in good stead for many years. I've
written zillions of parsers with it, and a couple of compilers. It's really
minimal, and stateless. As a minimal stateless thing, it is simply a C style
interface wrapped in a namespace.
You can certainly wrap it in a class if you wish, which is how this code
started out, but I've found it much more flexible as a C style API.
It operates on a block of text loaded into memory. There are no files or
iostreams involved.
The included stdint.h is lifted from the C99 specification. You can substitute
any typedefs you wish; I used it because it was convenient.
There are obviously lots of things not included, like substring matching, and
some odd things included, like scanning past C++ style comments. Basically,
the functions that got used a million zillion times over the years have
stayed, and all the other stuff is gone.
The interface is really easy to use. You pass in a pointer to where you are
now in the text, and a pointer to the end of the text. The routine you call
does its thing, and passes back a pointer to the next edible character. You
can use that pointer as the new current pointer, or you can forget it and
reuse your old current pointer to do something else.
I've attached a simple Wavefront OBJ file reader to demonstrate how the
TextScanner works.
Use the code any way you wish; you assume all responsibility for its
correctness and suitability should you use it.
- nick
|
Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [TextScanner.h] - (3,435 bytes)
/*
__ __ _ _
| \/ | ___ ___| |__ _ _| | __ _
| |\/| |/ _ \/ __| '_ \| | | | |/ _` |
| | | | __/\__ \ | | | |_| | | (_| |
|_| |_|\___||___/_| |_|\__,_|_|\__,_|
_____ _ ____
|_ _|____ _| |_/ ___| ___ __ _ _ __ _ __ ___ _ __
| |/ _ \ \/ / __\___ \ / __/ _` | '_ \| '_ \ / _ \ '__|
| | __/> <| |_ ___) | (_| (_| | | | | | | | __/ |
|_|\___/_/\_\\__|____/ \___\__,_|_| |_|_| |_|\___|_|
This parser was written in the distant past by Nick Porcino for the public domain
and is freely available on an as is basis. It is meant for educational
purposes and is not suitable for any particular purpose. No warranty is expressed
or implied. Use at your own risk. Do not operate heavy machinery or governments
while using this code.
The types referenced are compatible with the C99 stdint.h types.
*/
#pragma once
#ifndef TEXTSCANNER_H
#define TEXTSCANNER_H
#include <stdint.h>
// Stateless helpers for scanning a block of text held in memory.
//
// Forward-scanning routines take the current position (pCurr) and a pointer
// one past the last character of the text (pEnd) and hand back a pointer into
// the same text. Nothing is copied and the text is never modified.
namespace TextScanner
{
    // ---- Get Token -------------------------------------------------------
    // Every routine in this group first skips leading whitespace, then
    // reports the token through resultStringBegin / stringLength.

    // Token is the run of characters up to (not including) the first delim.
    char const* GetToken                       (char delim, char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);
    // Token is the run of characters up to the next whitespace character.
    char const* GetTokenWSDelimited            (char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);
    // Token is a run of letters, digits and '_'.
    char const* GetTokenAlphaNumeric           (char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);
    // Token is a run of letters, digits, '_' and ':'.
    char const* GetNameSpacedTokenAlphaNumeric (char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);

    // ---- Get Value -------------------------------------------------------

    // Finds the next '"' and reports the text between it and the following '"'.
    char const* GetString (char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength);
    // Skip whitespace, then read decimal digits. The signed overloads accept
    // a leading '+' or '-'; the int16_t overload narrows the int32_t result.
    char const* GetInt    (char const* pCurr, char const* pEnd, int16_t& result);
    char const* GetInt    (char const* pCurr, char const* pEnd, int32_t& result);
    char const* GetInt    (char const* pCurr, char const* pEnd, uint32_t& result);
    // Skip whitespace, then read hex digits of either case. No "0x" prefix
    // is recognised.
    char const* GetHex    (char const* pCurr, char const* pEnd, uint32_t& result);
    // Skip whitespace, then read [sign] digits ['.' digits] [('e'|'E') integer].
    char const* GetFloat  (char const* pCurr, char const* pEnd, float& result);

    // ---- Scan ------------------------------------------------------------
    // Forward scans return pEnd when nothing matches. Backward scans walk
    // from pCurr down to pStart (inclusive) and return pStart - 1 when
    // nothing matches.

    char const* ScanForCharacter            (char delim, char const* pCurr, char const* pEnd);
    char const* ScanBackwardsForCharacter   (char delim, char const* pCurr, char const* pStart);
    char const* ScanForWhiteSpace           (char const* pCurr, char const* pEnd);
    char const* ScanBackwardsForWhiteSpace  (char const* pCurr, char const* pStart);
    char const* ScanForNonWhiteSpace        (char const* pCurr, char const* pEnd);
    // Walks pEnd backwards towards pCurr while *pEnd is whitespace and
    // returns the adjusted pEnd.
    char const* ScanForTrailingNonWhiteSpace(char const* pCurr, char const* pEnd);
    char const* ScanForQuote                (char const* pCurr, char const* pEnd);
    // Stops on the line terminator; for a "\r\n" or "\n\r" pair it stops on
    // the second character of the pair.
    char const* ScanForEndOfLine            (char const* pCurr, char const* pEnd);
    // Stops on the character whose successor is '\r', '\n' or '\0'.
    char const* ScanForLastCharacterOnLine  (char const* pCurr, char const* pEnd);
    // ScanForEndOfLine followed by ScanForNonWhiteSpace.
    char const* ScanForBeginningOfNextLine  (char const* pCurr, char const* pEnd);
    // If pCurr is at "//" the rest of the line is skipped; if it is at "/*"
    // the scan resumes just past the matching "*/". Otherwise pCurr is
    // returned unchanged.
    char const* ScanPastCPPComments         (char const* pCurr, char const* pEnd);

    // ---- Simple Tests ----------------------------------------------------

    // True for tab, space, carriage return and line feed.
    template <class Type> inline bool IsWhiteSpace(Type test) { return (test == '\t' || test == ' ' || test == '\r' || test == '\n'); }
    // True for the ASCII digits '0'..'9'.
    template <class Type> inline bool IsNumeric(Type test)    { return (test >= '0' && test <= '9'); }
    // True for the ASCII letters 'a'..'z' and 'A'..'Z'.
    template <class Type> inline bool IsAlpha(Type test)      { return ((test >= 'a' && test <= 'z') || (test >= 'A' && test <= 'Z')); }
}
#endif
|
|
Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [ModelReader.h] - (524 bytes)
#pragma once
#ifndef MODELREADER_H
#define MODELREADER_H
#include <vector>
using std::vector;
// Plain three-component float vector used for the model's positions,
// normals and texture coordinates.
struct sdcVector
{
    // Zero-initialises all components so a default-constructed vector never
    // carries indeterminate values.
    sdcVector() : x(0.0f), y(0.0f), z(0.0f) { }

    // Builds a vector from its three components.
    sdcVector(float _x, float _y, float _z) : x(_x), y(_y), z(_z) { }

    float x;
    float y;
    float z;
};
class Model
{
public:
void ReadWavefrontOBJ(char* pSource, int dataSize);
vector <sdcVector> vertices;
vector <sdcVector> normals;
vector <sdcVector> mapping;
vector <int> faceIndices;
vector <int> normalIndices;
vector <int> uvIndices;
};
#endif
|
|
Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [stdint.h] - (132 bytes)
#pragma once
// Minimal stand-in for the C99 <stdint.h> fixed-width names used by the
// scanner. NOTE(review): the widths implied by the names hold only where
// short is 16 bits and int is 32 bits -- confirm for the target platform.

// 16-bit types
typedef short          int16_t;
typedef unsigned short uint16_t;

// 32-bit types
typedef int            int32_t;
typedef unsigned int   uint32_t;
|
|
Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [TextScanner.cpp] - (8,605 bytes)
/*
This parser was written in the distant past by Nick Porcino for the public domain
and is freely available on an as is basis. It is meant for educational
purposes and is not suitable for any particular purpose. No warranty is expressed
or implied. Use at your own risk. Do not operate heavy machinery or governments
while using this code.
*/
#include <cmath>
#include "TextScanner.h"
//! @todo replace Assert with proper error reporting mechanism
#include <assert.h>
#define Assert assert
/*----------------------------------------------------------------------------
*/
// Walks forward from pCurr and stops on the first double-quote character.
// Returns the address of that quote, or the end of the scan (pEnd) when the
// range contains none.
char const* TextScanner::ScanForQuote(char const* pCurr, char const* pEnd)
{
    Assert(pCurr && pEnd && pEnd >= pCurr);

    for (; pCurr < pEnd; ++pCurr)
    {
        if (*pCurr == '"')
            return pCurr;
    }
    return pCurr;
}
// Walks forward from pCurr and stops on the first whitespace character
// (as defined by IsWhiteSpace). Returns that position, or the end of the
// scan (pEnd) when there is no whitespace in the range.
char const* TextScanner::ScanForWhiteSpace(char const* pCurr, char const* pEnd)
{
    Assert(pCurr && pEnd && pEnd >= pCurr);

    for (; pCurr < pEnd; ++pCurr)
    {
        if (TextScanner::IsWhiteSpace(*pCurr))
            return pCurr;
    }
    return pCurr;
}
// Skips over whitespace starting at pCurr. Returns the position of the first
// character that is not whitespace, or the end of the scan (pEnd) when only
// whitespace remains.
char const* TextScanner::ScanForNonWhiteSpace(char const* pCurr, char const* pEnd)
{
    Assert(pCurr && pEnd && pEnd >= pCurr);

    while (pCurr < pEnd && TextScanner::IsWhiteSpace(*pCurr))
    {
        ++pCurr;
    }
    return pCurr;
}
// Walks backwards from pCurr (inclusive) down to pStart (inclusive) looking
// for a whitespace character. Returns its position, or pStart - 1 when the
// range holds no whitespace.
char const* TextScanner::ScanBackwardsForWhiteSpace(char const* pCurr, char const* pStart)
{
    Assert(pCurr && pStart && pStart <= pCurr);

    for (; pCurr >= pStart; --pCurr)
    {
        if (TextScanner::IsWhiteSpace(*pCurr))
            return pCurr;
    }
    return pCurr;
}
// Trims trailing whitespace: moves pEnd backwards towards pCurr for as long
// as the character at pEnd is whitespace, and returns the adjusted pEnd.
//
// NOTE(review): this reads *pEnd itself, so here pEnd apparently has to point
// at a readable character (the last one) rather than one past the end as in
// the forward scans -- confirm against callers.
char const* TextScanner::ScanForTrailingNonWhiteSpace(char const* pCurr, char const* pEnd)
{
    Assert(pCurr && pEnd && pEnd >= pCurr);

    while (pCurr < pEnd && TextScanner::IsWhiteSpace(*pEnd))
    {
        --pEnd;
    }
    return pEnd;
}
// Walks forward from pCurr and stops on the first character equal to delim.
// Returns that position, or the end of the scan (pEnd) when delim does not
// occur in the range.
char const* TextScanner::ScanForCharacter(char delim, char const* pCurr, char const* pEnd)
{
    Assert(pCurr && pEnd);

    for (; pCurr < pEnd; ++pCurr)
    {
        if (*pCurr == delim)
            return pCurr;
    }
    return pCurr;
}
// Walks backwards from pCurr (inclusive) down to pStart (inclusive) looking
// for delim. Returns its position, or pStart - 1 when delim does not occur
// in the range.
char const* TextScanner::ScanBackwardsForCharacter(char delim, char const* pCurr, char const* pStart)
{
    Assert(pCurr && pStart && pStart <= pCurr);

    for (; pCurr >= pStart; --pCurr)
    {
        if (*pCurr == delim)
            return pCurr;
    }
    return pCurr;
}
// Extracts a token terminated by delim.
//
// Leading whitespace is skipped; the token is the run of characters from
// there up to (not including) the first delim, or up to pEnd when delim does
// not occur.
//
// resultStringBegin receives the first character of the token.
// stringLength      receives the number of characters in the token.
//
// Returns the position just after the token (the delimiter itself, or pEnd).
// This matches GetTokenWSDelimited; returning the token start, as before,
// left callers unable to advance through the text.
char const* TextScanner::GetToken(char delim, char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
    Assert(pCurr && pEnd);

    pCurr = ScanForNonWhiteSpace(pCurr, pEnd);
    resultStringBegin = pCurr;

    char const* pStringEnd = ScanForCharacter(delim, pCurr, pEnd);
    stringLength = (uint32_t)(pStringEnd - resultStringBegin);

    return pStringEnd;
}
// Extracts an identifier-like token: after skipping leading whitespace, the
// token is the longest run of letters, digits and underscores.
//
// resultStringBegin receives the first character of the token.
// stringLength      receives the number of characters accepted (0 if none).
//
// Returns the position of the first character that was not accepted.
char const* TextScanner::GetTokenAlphaNumeric(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
    Assert(pCurr && pEnd);

    pCurr = ScanForNonWhiteSpace(pCurr, pEnd);
    resultStringBegin = pCurr;
    stringLength = 0;

    for (; pCurr < pEnd; ++pCurr, ++stringLength)
    {
        char const c = *pCurr;
        // Whitespace is neither '_', a digit nor a letter, so it ends the
        // token through the same test.
        bool const isTokenChar = (c == '_') || TextScanner::IsNumeric(c) || TextScanner::IsAlpha(c);
        if (!isTokenChar)
            break;
    }
    return pCurr;
}
// Extracts a namespace-qualified identifier: after skipping leading
// whitespace, the token is the longest run of letters, digits, underscores
// and colons.
//
// resultStringBegin receives the first character of the token.
// stringLength      receives the number of characters accepted (0 if none).
//
// Returns the position of the first character that was not accepted.
char const* TextScanner::GetNameSpacedTokenAlphaNumeric (char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
    Assert(pCurr && pEnd);

    pCurr = ScanForNonWhiteSpace(pCurr, pEnd);
    resultStringBegin = pCurr;
    stringLength = 0;

    for (; pCurr < pEnd; ++pCurr, ++stringLength)
    {
        char const c = *pCurr;
        // Whitespace fails this test as well, so it also ends the token.
        bool const isTokenChar = (c == ':') || (c == '_') || TextScanner::IsNumeric(c) || TextScanner::IsAlpha(c);
        if (!isTokenChar)
            break;
    }
    return pCurr;
}
// Extracts a whitespace-delimited token: leading whitespace is skipped and
// the token runs up to the next whitespace character or pEnd.
//
// resultStringBegin receives the first character of the token.
// stringLength      receives the number of characters in the token.
//
// Returns the position just after the token.
char const* TextScanner::GetTokenWSDelimited(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
    Assert(pCurr && pEnd);

    char const* const pTokenBegin = ScanForNonWhiteSpace(pCurr, pEnd);
    char const* const pTokenEnd   = ScanForWhiteSpace(pTokenBegin, pEnd);

    resultStringBegin = pTokenBegin;
    stringLength = (uint32_t)(pTokenEnd - pTokenBegin);
    return pTokenEnd;
}
// Extracts the contents of the next double-quoted string.
//
// Scans forward to the opening '"', then to the closing '"'. Escape sequences
// are not interpreted; the first '"' after the opening one ends the string.
//
// resultStringBegin receives the first character after the opening quote.
//                   When no opening quote exists it is set to pEnd so that it
//                   is never left unassigned.
// stringLength      receives the number of characters between the quotes. If
//                   the closing quote is missing, the string is taken to run
//                   to pEnd. It is 0 when there is no opening quote.
//
// Returns the position just past the closing quote, or pEnd when a quote is
// missing. The result never exceeds pEnd; previously an unterminated string
// produced pEnd + 1.
char const* TextScanner::GetString(char const* pCurr, char const* pEnd, char const*& resultStringBegin, uint32_t& stringLength)
{
    Assert(pCurr && pEnd && pEnd >= pCurr);

    stringLength = 0;

    pCurr = ScanForQuote(pCurr, pEnd);
    if (pCurr >= pEnd)
    {
        // no opening quote anywhere in the range
        resultStringBegin = pCurr;
        return pCurr;
    }

    ++pCurr;                            // skip past opening quote
    resultStringBegin = pCurr;

    pCurr = ScanForQuote(pCurr, pEnd);
    stringLength = (uint32_t)(pCurr - resultStringBegin);

    if (pCurr < pEnd)
        ++pCurr;                        // point past closing quote

    return pCurr;
}
// Walks forward from pCurr to the end of the current line.
//
// Stops on the first '\r' or '\n'. When that character is immediately
// followed by its partner ("\r\n" or "\n\r"), the returned position is the
// second character of the pair, so the result always addresses the last
// character of the line terminator. Returns pEnd when the range contains no
// line terminator.
//
// The look-ahead for the partner character is bounds-checked so that nothing
// at or beyond pEnd is read.
char const* TextScanner::ScanForEndOfLine(char const* pCurr, char const* pEnd)
{
    for (; pCurr < pEnd; ++pCurr)
    {
        char const c = *pCurr;
        if (c != '\r' && c != '\n')
            continue;

        char const partner = (c == '\r') ? '\n' : '\r';
        if ((pCurr + 1) < pEnd && pCurr[1] == partner)
            ++pCurr;
        break;
    }
    return pCurr;
}
// Walks forward from pCurr to the last character of the current line.
//
// Stops on the character whose successor is '\r', '\n' or '\0', or on the
// final character of the range when no such successor exists. Returns pCurr
// unchanged when the range is empty.
//
// The successor is only inspected when it lies inside the range, so the scan
// no longer depends on a readable terminator at pEnd.
char const* TextScanner::ScanForLastCharacterOnLine(char const* pCurr, char const* pEnd)
{
    while (pCurr < pEnd)
    {
        char const* const pNext = pCurr + 1;
        if (pNext >= pEnd)
            break;      // pCurr is the last character of the text

        if (*pNext == '\r' || *pNext == '\n' || *pNext == '\0')
            break;

        pCurr = pNext;
    }
    return pCurr;
}
// Reads a decimal integer via the int32_t overload and narrows the value to
// int16_t; values outside the int16_t range are truncated by the conversion.
// Returns the position at which the int32_t overload stopped.
char const* TextScanner::GetInt(char const* pCurr, char const* pEnd, int16_t& result)
{
    int32_t wide = 0;
    char const* const pNext = GetInt(pCurr, pEnd, wide);
    result = static_cast<int16_t>(wide);
    return pNext;
}
// Reads a signed decimal integer.
//
// Leading whitespace is skipped, an optional '+' or '-' is consumed, then
// decimal digits are accumulated until a non-digit or pEnd is reached.
//
// result receives the parsed value, or 0 when no digits are present (a lone
//        sign is still consumed).
//
// Returns the position of the first character that is not part of the number.
//
// Every dereference is bounds-checked against pEnd, so an empty or
// whitespace-only range is safe. Digits are accumulated in an unsigned value
// so over-long inputs wrap around instead of triggering signed overflow.
char const* TextScanner::GetInt(char const* pCurr, char const* pEnd, int32_t& result)
{
    pCurr = ScanForNonWhiteSpace(pCurr, pEnd);
    result = 0;

    bool negative = false;
    if (pCurr < pEnd && (*pCurr == '+' || *pCurr == '-'))
    {
        negative = (*pCurr == '-');
        ++pCurr;
    }

    uint32_t magnitude = 0;
    for (; pCurr < pEnd && TextScanner::IsNumeric(*pCurr); ++pCurr)
    {
        magnitude = magnitude * 10u + static_cast<uint32_t>(*pCurr - '0');
    }

    // Negate in unsigned arithmetic so that the most negative value
    // ("-2147483648") converts without overflowing.
    result = static_cast<int32_t>(negative ? (0u - magnitude) : magnitude);
    return pCurr;
}
// Reads an unsigned decimal integer.
//
// Leading whitespace is skipped and decimal digits are accumulated until a
// non-digit or pEnd is reached. No sign character is accepted.
//
// result receives the parsed value, or 0 when no digits are present.
//
// Returns the position of the first character that is not a digit.
char const* TextScanner::GetInt(char const* pCurr, char const* pEnd, uint32_t& result)
{
    pCurr = ScanForNonWhiteSpace(pCurr, pEnd);
    result = 0;

    for (; pCurr < pEnd && TextScanner::IsNumeric(*pCurr); ++pCurr)
    {
        result = result * 10 + *pCurr - '0';
    }
    return pCurr;
}
// Reads a floating point number of the form
//     [sign] digits ['.' digits] [('e' | 'E') [sign] digits]
//
// Leading whitespace is skipped. The sign is handled here rather than by
// GetInt so that values such as "-0.5" keep their sign even though their
// integer part is zero.
//
// result receives the parsed value, or 0 when no number is present.
//
// Returns the position of the first character that is not part of the number.
//
// Every dereference is bounds-checked against pEnd, so the routine can be
// called at, or run up to, the very end of the text.
//
// NOTE(review): the integer part and the exponent are read with the int32_t
// GetInt overload, so an integer part that does not fit in int32_t is not
// represented correctly.
char const* TextScanner::GetFloat(char const* pCurr, char const* pEnd, float& result)
{
    pCurr = ScanForNonWhiteSpace(pCurr, pEnd);
    result = 0.0f;

    bool negative = false;
    if (pCurr < pEnd && (*pCurr == '+' || *pCurr == '-'))
    {
        negative = (*pCurr == '-');
        ++pCurr;
    }

    // get integer part
    int32_t intPart = 0;
    pCurr = GetInt(pCurr, pEnd, intPart);
    result = static_cast<float>(intPart);

    // get fractional part
    if (pCurr < pEnd && *pCurr == '.')
    {
        ++pCurr;
        float scaler = 0.1f;
        for (; pCurr < pEnd && TextScanner::IsNumeric(*pCurr); ++pCurr)
        {
            result = result + (float(*pCurr - '0') * scaler);
            scaler *= 0.1f;
        }
    }

    // get exponent
    if (pCurr < pEnd && (*pCurr == 'e' || *pCurr == 'E'))
    {
        ++pCurr;
        pCurr = GetInt(pCurr, pEnd, intPart);
        // scale in double precision, then narrow once
        result = static_cast<float>(result * std::pow(10.0, static_cast<double>(intPart)));
    }

    if (negative)
    {
        result = -result;
    }
    return pCurr;
}
// Reads an unsigned hexadecimal integer.
//
// Leading whitespace is skipped, then hex digits ('0'-'9', 'A'-'F', 'a'-'f')
// are accumulated until any other character or pEnd is reached. A "0x"
// prefix is not recognised: the 'x' simply ends the number.
//
// result receives the parsed value, or 0 when no hex digits are present.
//
// Returns the position of the first character that is not a hex digit.
char const* TextScanner::GetHex(char const* pCurr, char const* pEnd, uint32_t& result)
{
    pCurr = ScanForNonWhiteSpace(pCurr, pEnd);
    result = 0;

    for (; pCurr < pEnd; ++pCurr)
    {
        char const c = *pCurr;
        uint32_t digit;

        if (TextScanner::IsNumeric(c))
            digit = static_cast<uint32_t>(c - '0');
        else if (c >= 'A' && c <= 'F')
            digit = static_cast<uint32_t>(c - 'A' + 10);
        else if (c >= 'a' && c <= 'f')
            digit = static_cast<uint32_t>(c - 'a' + 10);
        else
            break;

        result = result * 16 + digit;
    }
    return pCurr;
}
// Moves to the end of the current line and then past any whitespace that
// follows, so blank lines and leading indentation are skipped too.
// Returns the position reached, or the end of the scan when nothing but
// whitespace remains.
char const* TextScanner::ScanForBeginningOfNextLine(char const* pCurr, char const* pEnd)
{
    char const* const pLineEnd = ScanForEndOfLine(pCurr, pEnd);
    return ScanForNonWhiteSpace(pLineEnd, pEnd);
}
// Skips a single C++ comment that starts exactly at pCurr.
//
//  - "//" comment: returns the result of ScanForEndOfLine, i.e. the line
//    terminator that ends the comment (or pEnd).
//  - "/*" comment: returns the position just past the closing "*/", or pEnd
//    when the comment is not terminated.
//  - anything else: returns pCurr unchanged.
//
// A comment opener needs two characters, so ranges shorter than that are
// returned untouched. All look-ahead is bounds-checked against pEnd.
char const* TextScanner::ScanPastCPPComments(char const* pCurr, char const* pEnd)
{
    if ((pEnd - pCurr) < 2 || pCurr[0] != '/')
        return pCurr;

    if (pCurr[1] == '/')
        return ScanForEndOfLine(pCurr, pEnd);

    if (pCurr[1] != '*')
        return pCurr;

    pCurr += 2;     // skip the opening "/*"
    while ((pEnd - pCurr) >= 2)
    {
        if (pCurr[0] == '*' && pCurr[1] == '/')
            return pCurr + 2;
        ++pCurr;
    }

    // unterminated block comment: everything up to pEnd belongs to it
    return pEnd;
}
|
|
Currently browsing [versatileTextScanner.zip] (4,417 bytes) - [ModelReader.cpp] - (3,508 bytes)
// standard libraries
#include <vector>
using std::vector;
#include "TextScanner.h"
// model reader package
#include "ModelReader.h"
void Model::ReadWavefrontOBJ(char* pSource, int dataSize)
{
char const* pCurr = pSource;
char const* pEnd = pSource + dataSize;
while (pCurr < pEnd)
{
pCurr = TextScanner::ScanForNonWhiteSpace(pCurr, pEnd);
switch (*pCurr)
{
case 'u': // u is the first character of the usemtl command
case 'g': // group
case '#': // comment - skip over 'em
default: // unknown tokens - skip over 'em
pCurr = TextScanner::ScanForEndOfLine(pCurr, pEnd);
pCurr = TextScanner::ScanForNonWhiteSpace(pCurr, pEnd);
break;
case 'v': // vertex
{
sdcVector temp(0, 0, 0);
++pCurr; // skip past v
switch (*pCurr)
{
case 'n': // vertex normal
++pCurr; // skip n
pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.x);
pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.y);
pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.z);
normals.push_back(temp);
break;
case 't': // vertex texture
++pCurr; // skip t
pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.x);
pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.y);
pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.z);
mapping.push_back(temp);
break;
default: // vertex position
pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.x);
pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.y);
pCurr = TextScanner::GetFloat(pCurr, pEnd, temp.z);
vertices.push_back(temp);
break;
}
}
break;
case 'f': // face
{
++pCurr; // skip past f
char const* eol = TextScanner::ScanForEndOfLine(pCurr, pEnd);
int v[3];
int t[3];
int n[3];
int index = 0;
while (pCurr < eol)
{
// get index
pCurr = TextScanner::GetInt(pCurr, eol, v[index]);
v[index] -= 1;
++pCurr;
if (*pCurr == '/') // no tex coord?
{
t[index] = -1;
}
else
{
pCurr = TextScanner::GetInt(pCurr, eol, t[index]);
t[index] -= 1;
}
++pCurr;
if (*pCurr == '/') // no normal?
{
n[index] = -1;
}
else
{
pCurr = TextScanner::GetInt(pCurr, eol, n[index]);
n[index] -= 1;
}
++index;
if (index == 3)
{
faceIndices.push_back(v[0]);
faceIndices.push_back(v[1]);
faceIndices.push_back(v[2]);
normalIndices.push_back(n[0]);
normalIndices.push_back(n[1]);
normalIndices.push_back(n[2]);
uvIndices.push_back(t[0]);
uvIndices.push_back(t[1]);
uvIndices.push_back(t[2]);
// if a polygon was encountered, turn it into triangles
// by creating a fan
v[1] = v[2];
n[1] = n[2];
t[1] = t[2];
index = 2;
}
}
}
break;
case 'o': // indicates an object name. skip it!
case 'm': // material library m is first character of mtllib
{
pCurr = TextScanner::ScanForEndOfLine(pCurr, pEnd); // skip it for the moment
}
break;
// bmf extension t
case 't': // beginning a triangle strip
break;
// bmf extension q
case 'q': // continuing a triangle strip
break;
}
}
}
|
|
The zip file viewer built into the Developer Toolbox made use
of the zlib library, as well as the zlibdll source additions.
|