tag */
#define S_HEAD (1 << 6) /* inside a tag */
#define S_PRE (1 << 7) /* inside a tag */
#define QUOTE '"' /* quotation mark character */
#define CH_FILL ' ' /* fill character for indents */
#define CH_BOLD '*' /* bold character */
#define CH_H1 '=' /* heading level 1 */
#define CH_H2 '-' /* heading level 2 */
#define CH_ITALIC '~' /* italic character */
#define CH_ULINE '_' /* underline character */
#define CH_GT '>' /* "greater than" character */
#define CH_LT '<' /* "less than" character */
#define CH_UML 'e' /* "umlaut" character */
#define OPEN_ENTITY '&' /* start of SGML entity */
#define CLOSE_ENTITY ';' /* end of SGML entity */
#define NUM_ENTITY '#' /* numeric SGML entity */
#define FRAGMENT '#' /* URL fragment */
#define OPEN_TAG '<' /* start of SGML tag */
#define CLOSE_TAG '>' /* end of SGML tag */
#define END_TAG "/" /* SGML closing tag */
#define A_TAG "A" /* anchor tag */
#define BOLD_TAG "B" /* bold tag */
#define BODY_TAG "BODY" /* body tag */
#define BREAK_TAG "BR" /* line break tag */
#define DDEF_TAG "DD" /* definition tag */
#define DTERM_TAG "DT" /* term tag */
#define EM_TAG "EM" /* emphasis tag */
#define HEAD_TAG "HEAD" /* header tag */
#define HRULE_TAG "HR" /* horizontal rule tag */
#define HREF_TAG "HREF" /* hypertext reference tag */
#define H1_TAG "H1" /* heading level 1 tag */
#define H2_TAG "H2" /* heading level 2 tag */
#define H3_TAG "H3" /* heading level 3 tag */
#define H4_TAG "H4" /* heading level 4 tag */
#define H5_TAG "H5" /* heading level 5 tag */
#define H6_TAG "H6" /* heading level 6 tag */
#define ITALIC_TAG "I" /* italic tag */
#define LIST_TAG "LI" /* list element tag */
#define PARA_TAG "P" /* paragraph break */
#define PRE_TAG "PRE" /* preformatted text tag */
#define STRONG_TAG "STRONG" /* strong emphasis tag */
#define TITLE_TAG "TITLE" /* title tag */
#define UL_TAG "UL" /* unordered list tag */
#define GT_ENT "gt" /* "greater than" entity */
#define LT_ENT "lt" /* "less than" entity */
#define UML_ENT "uml" /* "umlaut" entity */
#define NEWLINE "\n" /* newline */
#define SENTENCE ".:!?" /* end of sentence characters */
#define SE_BULLET "*" /* setext bullet */
#define SE_EOF "$$\n" /* setext end of file marker */
#define SE_FORMAT "[%d]" /* setext hyperlink printf format */
#define SE_LINK ".. " /* setext hyperlink prefix */
#define SE_RULE " ____________________________________________________________"
typedef int Boolean; /* booleans are stored as ints */
/*
 * Node of the singly linked list of hypertext references collected while
 * filtering; the list is extended by store_href() and printed and
 * released by output_hrefs().
 */
typedef struct href
{
struct href *next; /* pointer to next Href, NULL at the end of the list */
char *url; /* hypertext link URL (heap-allocated copy) */
} Href; /* hypertext reference structure */
Href *href_head = NULL; /* pointer to first Href, NULL while the list is empty */
void heading(int col, char *out, int state)
/*
 * Append a setext heading underline to the string <out>: <col> copies
 * of the underline character followed by a newline. The character is
 * CH_H1 when S_H1 is set in <state> and CH_H2 otherwise.
 * Note: no bounds checking is done on <out>.
 */
{
    char *end = out + strlen(out);  /* current end of the output string */
    int remaining;                  /* underline characters still to write */
    char underline;                 /* heading character */

    if (TEST(state, S_H1))
        underline = (char) CH_H1;
    else
        underline = (char) CH_H2;

    for (remaining = col; remaining != 0; remaining--)
        *end++ = underline;
    *end = 0;

    strcat(end, NEWLINE);
}
void wordwrap(char *in, char *out, int *state)
/*
 * Append the word <in> to the string <out>, wrapping and indenting as
 * required, then honour and clear any pending S_FLUSH / S_BREAK request
 * in <state>.
 *
 * Unless S_PRE is set in <state>:
 *  - leading whitespace is stripped from the word;
 *  - a new line is started when the word would reach RMARGIN (followed by
 *    a heading underline when S_H1 or S_H2 is set);
 *  - the line is indented to LMARGIN unless S_NOINDENT is set;
 *  - words are separated by one space, or by two after a word that ends
 *    in one of the SENTENCE characters (optionally followed by a closing
 *    quotation mark).
 * With S_PRE set the word is appended exactly as given.
 *
 * S_FLUSH ends the current line if anything has been written past the
 * indent; S_BREAK always emits a newline. Both flags are cleared here.
 *
 * The column position and the end-of-sentence flag are static, so they
 * persist between calls.
 * Note: no bounds checking is done on <out>.
 */
{
    static int col;         /* column position */
    int i;                  /* temporary string index */
    static int sentence;    /* end of sentence flag */

    if (!TEST(*state, S_PRE))
    {
        /*
         * strip the whitespace; the cast keeps isspace() well defined for
         * characters outside the 7-bit range when char is signed
         */
        while (isspace((unsigned char) *in))
            in++;
        if (*in)
        {
            /* wrap if right margin exceeded */
            if ((col + strlen(in)) >= RMARGIN)
            {
                strcat(out, NEWLINE);
                if (TEST(*state, S_H1 | S_H2))
                    heading(col, out, *state);
                col = 0;
            }
            /* indent */
            if (!TEST(*state, S_NOINDENT))
            {
                while (col < LMARGIN)
                {
                    strcat(out, " ");
                    col++;
                }
            }
            /* separate words with a space or two */
            if (col > (TEST(*state, S_NOINDENT) ? 0 : LMARGIN))
            {
                if (sentence)
                {
                    strcat(out, " ");
                    col++;
                    sentence = FALSE;
                }
                strcat(out, " ");
                col++;
            }
            /*
             * check for end of sentence: look at the last character, or
             * at the one before it when the word ends in a quotation mark
             * (the word is non-empty here, so i is never negative)
             */
            i = (int) strlen(in) - 1;
            if ((in[i] == QUOTE) && i)
                i--;
            sentence = (strchr(SENTENCE, in[i]) != NULL);
        }
    }
    /* output the word */
    strcat(out, in);
    col += (int) strlen(in);
    if (TEST(*state, S_FLUSH))
    {
        /* only end the line if something follows the indent */
        if (col > (TEST(*state, S_NOINDENT) ? 0 : LMARGIN))
        {
            strcat(out, NEWLINE);
            if (TEST(*state, S_H1 | S_H2))
                heading(col, out, *state);
            col = 0;
        }
        RESET(*state, S_FLUSH);
    }
    if (TEST(*state, S_BREAK))
    {
        strcat(out, NEWLINE);
        col = 0;
        RESET(*state, S_BREAK);
    }
}
void capitalise(char *s)
/*
 * Convert the string <s> to upper case in place, leaving any text
 * enclosed in quotation marks (QUOTE) unchanged.
 */
{
    Boolean quote = FALSE;  /* TRUE while inside a quoted portion */

    while (*s)
    {
        if (*s == QUOTE)
            quote = !quote;
        else if (!quote)
        {
            /*
             * toupper() is only defined for values representable as
             * unsigned char (or EOF), so convert before calling it
             */
            *s = (char) toupper((unsigned char) *s);
        }
        s++;
    }
}
void xtract_url(char *in, char *out)
/*
 * Scan the tag text <in> for an HREF attribute and copy the URL found
 * between the following pair of quotation marks to <out>.
 *
 * The first character of <in> (the tag name) is skipped. <out> is left
 * unchanged when no HREF with a quoted value is present; an HREF whose
 * value is not quoted is ignored.
 * Note: no bounds checking is done on <out>.
 */
{
    char *quote;    /* opening quotation mark of the HREF value */

    if (*in)
        in++;
    while (*in)
    {
        /* skip whitespace */
        while (isspace((unsigned char) *in))
            in++;
        if (!strncmp(in, HREF_TAG, sizeof(HREF_TAG) - 1))
        {
            /* strchr() returns NULL when the value is not quoted */
            quote = strchr(in, QUOTE);
            if (quote)
            {
                in = quote + 1;
                while (*in && (*in != QUOTE))
                    *(out++) = *(in++);
                *out = 0;
            }
        }
        /* skip non-whitespace */
        while (*in && !isspace((unsigned char) *in))
            in++;
    }
}
int store_href(char *url)
/* store an HREF in a linked list; returns list element number */
{
int i = 0; /* list counter */
Href *href = href_head; /* hypertext reference pointer */
Href *last = NULL; /* hypertext reference pointer */
/* check if this URL is already stored in the linked list */
while (href && strcmp(href->url, url))
{
i++;
last = href;
href = href->next;
}
if (!href)
{
href = (Href *) malloc(sizeof(Href));
href->next = NULL;
href->url = strdup(url);
if (!last)
href_head = href;
else
last->next = href;
}
return i;
}
char *xlate_tag(char *in, char *word, char *out, int *state)
/*
 * Translate the HTML tag <in> (the text between OPEN_TAG and CLOSE_TAG).
 *
 * <in> is converted to upper case in place (quoted portions excepted).
 * Inline markup (bold, emphasis, italic, hyperlink numbers) is appended
 * to the NUL-terminated pending word <word>; structural tags pass the
 * pending word to wordwrap(), which concatenates it to <out>, and adjust
 * the flags in <state>. Unrecognised tags, including recognised names
 * followed by attributes other than the anchor tag, are ignored.
 *
 * Returns the position in <word> at which the next character is to be
 * stored; the text before it is not NUL-terminated by this function.
 * Note: no bounds checking is done on <word> or <out>.
 */
{
    char *wp;                   /* word string pointer */
    static char url[MAXURL];    /* URL of the anchor currently open */

    capitalise(in);
    wp = word + strlen(word);
    /*
     * anchor: the name must be exactly A_TAG, i.e. be followed by
     * whitespace or the end of the tag, so that other tags starting with
     * the same letter (AREA, ADDRESS, ...) do not set the pending URL
     */
    if (!strncmp(in, A_TAG, sizeof(A_TAG) - 1) &&
        (!in[sizeof(A_TAG) - 1] ||
         isspace((unsigned char) in[sizeof(A_TAG) - 1])))
        xtract_url(in, url);
    /* end of anchor: append the link number unless fragment-only URL */
    else if (!strcmp(in, END_TAG A_TAG) && *url && (*url != FRAGMENT))
    {
        sprintf(wp, SE_FORMAT, store_href(url) + 1);
        wp += strlen(wp);
        *url = 0;
    }
    else if (!strcmp(in, BOLD_TAG) || !strcmp(in, END_TAG BOLD_TAG) ||
             !strcmp(in, STRONG_TAG) || !strcmp(in, END_TAG STRONG_TAG))
    {
        *(wp++) = CH_BOLD;
        *(wp++) = CH_BOLD;
    }
    else if (!strcmp(in, BREAK_TAG))
    {
        /* output the pending word followed by a newline */
        SET(*state, S_BREAK);
        *wp = 0;
        wordwrap(word, out, state);
        wp = word;
    }
    else if (!strcmp(in, DTERM_TAG))
        SET(*state, S_NOINDENT);
    else if (!strcmp(in, DDEF_TAG))
    {
        /* finish the term line, then indent the definition again */
        SET(*state, S_FLUSH);
        *wp = 0;
        wordwrap(word, out, state);
        wp = word;
        RESET(*state, S_NOINDENT);
    }
    else if (!strcmp(in, EM_TAG) || !strcmp(in, END_TAG EM_TAG))
        *(wp++) = CH_ULINE;
    else if (!strcmp(in, H1_TAG) || !strcmp(in, H2_TAG) ||
             !strcmp(in, H3_TAG) || !strcmp(in, H4_TAG) ||
             !strcmp(in, H5_TAG) || !strcmp(in, H6_TAG))
    {
        SET(*state, S_BREAK | S_FLUSH | S_NOINDENT);
        *wp = 0;
        wordwrap(word, out, state);
        wp = word;
        /* only heading levels 1 and 2 are underlined */
        if (in[1] == '1')
            SET(*state, S_H1);
        else if (in[1] == '2')
            SET(*state, S_H2);
    }
    else if (!strcmp(in, END_TAG H1_TAG) || !strcmp(in, END_TAG H2_TAG) ||
             !strcmp(in, END_TAG H3_TAG) || !strcmp(in, END_TAG H4_TAG) ||
             !strcmp(in, END_TAG H5_TAG) || !strcmp(in, END_TAG H6_TAG))
    {
        /* the heading flags are still set here, so wordwrap() underlines */
        SET(*state, S_BREAK | S_FLUSH);
        *wp = 0;
        wordwrap(word, out, state);
        wp = word;
        RESET(*state, S_H1 | S_H2 | S_NOINDENT);
    }
    else if (!strcmp(in, HEAD_TAG) || !strcmp(in, TITLE_TAG))
        SET(*state, S_HEAD);
    else if (!strcmp(in, END_TAG HEAD_TAG) ||
             !strcmp(in, END_TAG TITLE_TAG) || !strcmp(in, BODY_TAG))
    {
        /* discard whatever was collected inside the header */
        RESET(*state, S_HEAD);
        wp = word;
    }
    else if (!strcmp(in, HRULE_TAG))
    {
        SET(*state, S_FLUSH);
        *wp = 0;
        wordwrap(word, out, state);
        wp = word;
        /* Note: this could overflow the out buffer! */
        strcat(out, SE_RULE NEWLINE NEWLINE);
    }
    else if (!strcmp(in, ITALIC_TAG) || !strcmp(in, END_TAG ITALIC_TAG))
        *(wp++) = CH_ITALIC;
    else if (!strcmp(in, LIST_TAG))
    {
        /* finish the current line, then start a new one with a bullet */
        SET(*state, S_FLUSH | S_NOINDENT);
        *wp = 0;
        wordwrap(word, out, state);
        wp = word;
        wordwrap(SE_BULLET, out, state);
        RESET(*state, S_NOINDENT);
    }
    else if (!strcmp(in, PARA_TAG) ||
             !strcmp(in, UL_TAG) || !strcmp(in, END_TAG UL_TAG))
    {
        SET(*state, S_BREAK | S_FLUSH);
        *wp = 0;
        wordwrap(word, out, state);
        wp = word;
    }
    else if (!strcmp(in, PRE_TAG))
        SET(*state, S_PRE);
    else if (!strcmp(in, END_TAG PRE_TAG))
        RESET(*state, S_PRE);
    return wp;
}
char *xlate_entity(char *in, char *out)
/*
 * Translate the SGML entity named <in> (the text between OPEN_ENTITY and
 * CLOSE_ENTITY) and store the result at <out>.
 *
 * Handles "gt", "lt", "amp", "quot", numeric entities with a code from
 * 1 to 255, and the umlaut entities (a letter followed by "uml"), which
 * are written as the letter followed by CH_UML. Any other entity
 * produces no output.
 *
 * Stores at most two characters, does not NUL-terminate, and returns the
 * position following the last character stored.
 */
{
    long code;  /* value of a numeric entity */

    if (!strcmp(in, GT_ENT))
        *(out++) = CH_GT;
    else if (!strcmp(in, LT_ENT))
        *(out++) = CH_LT;
    else if (!strcmp(in, "amp"))
        *(out++) = '&';
    else if (!strcmp(in, "quot"))
        *(out++) = '"';
    else if (*in == NUM_ENTITY)
    {
        /*
         * a zero code would terminate the caller's string early and a
         * code above 255 does not fit in a char, so both are dropped
         */
        code = strtol(in + 1, NULL, 10);
        if ((code > 0) && (code < 256))
            *(out++) = (char) code;
    }
    /* the emptiness test keeps in + 1 inside the string */
    else if (*in && !strcmp(in + 1, UML_ENT))
    {
        /* handle all umlauts by suffixing character with an 'e' */
        *(out++) = *in;
        *(out++) = CH_UML;
    }
    return out;
}
void filter(char *in, char *out)
/*
 * Filter the text <in>, concatenating the result to the string <out>.
 * SGML entities are translated by xlate_entity(), HTML tags by
 * xlate_tag(), and the remaining text is split into words that are
 * passed to wordwrap(). Words collected while S_HEAD is set are
 * discarded.
 *
 * All parsing state (flags, partial entity, partial tag, pending word)
 * is static, so entities, tags and words may continue from one call to
 * the next.
 *
 * Note: Words are terminated by whitespace; the input stream must end
 * with a whitespace character or the last word may not be output.
 * Note: no bounds checking is done on <out>.
 */
{
    static int state;           /* HTML state flags */
    static char entity[MAXENT]; /* entity string */
    static char *ep;            /* entity string pointer, NULL outside an entity */
    static char tag[MAXTAG];    /* tag string */
    static char *tp;            /* tag string pointer, NULL outside a tag */
    static char word[MAXWORD];  /* word string */
    static char *wp = word;     /* word string pointer */

    while (*in)
    {
        if (ep && (*in == CLOSE_ENTITY))
        {
            /*
             * end of entity: translate it into the tag or the word being
             * collected; the translation is up to two characters long and
             * is dropped when they (plus a terminator) no longer fit
             */
            *ep = 0;
            if (tp)
            {
                if (tp < (tag + MAXTAG - 2))
                    tp = xlate_entity(entity, tp);
            }
            else if (wp < (word + MAXWORD - 2))
                wp = xlate_entity(entity, wp);
            ep = NULL;
        }
        else if (tp && (*in == CLOSE_TAG))
        {
            *wp = *tp = 0;
            wp = xlate_tag(tag, word, out, &state);
            tp = NULL;
        }
        else if (*in == OPEN_ENTITY)
            ep = entity;
        else if (*in == OPEN_TAG)
            tp = tag;
        else if (ep)
        {
            if (ep < (entity + MAXENT - 1))
                *(ep++) = *in;
            else
            {
                /*
                 * too long to be an entity: treat the text collected so
                 * far, and the current character, as part of the word
                 */
                *ep = 0;
                if (wp < (word + MAXWORD - 1))
                    *(wp++) = OPEN_ENTITY;
                for (ep = entity; *ep && (wp < (word + MAXWORD - 1)); ep++)
                    *(wp++) = *ep;
                if (wp < (word + MAXWORD - 1))
                    *(wp++) = *in;
                ep = NULL;
            }
        }
        else if (tp)
        {
            /* characters that do not fit in the tag buffer are dropped */
            if (tp < (tag + MAXTAG - 1))
                *(tp++) = *in;
        }
        else if (!isspace((unsigned char) *in) && (wp < (word + MAXWORD - 1)))
            *(wp++) = *in;
        else
        {
            /* whitespace, or the word buffer is full: output the word */
            if (!TEST(state, S_HEAD))
            {
                *wp = 0;
                wordwrap(word, out, &state);
            }
            /* the current character starts the next word */
            wp = word;
            *(wp++) = *in;
        }
        in++;
    }
}
void output_hrefs(void)
/*
 * Write the collected hypertext references to stdout as numbered setext
 * link lines, preceded by an empty line when there is at least one, and
 * release the linked list while doing so.
 */
{
    int number = 1;     /* number of the link being written */
    Href *node;         /* list element being written */

    if (href_head != NULL)
        fputs(NEWLINE, stdout);

    while ((node = href_head) != NULL)
    {
        fprintf(stdout, "%s" SE_FORMAT " %s" NEWLINE,
                SE_LINK, number, node->url);
        number++;

        /* unlink the element before freeing it */
        href_head = node->next;
        free(node->url);
        free(node);
    }
}
int main(void)
/*
 * Read lines from stdin, filter them and write the result to stdout,
 * followed by the list of hypertext links and the setext end of file
 * marker. Always returns 0.
 */
{
    char in[BUFSIZE];       /* input buffer */
    char out[BUFSIZE];      /* output buffer */
    char flush[] = NEWLINE; /* whitespace used to flush the last word */

    /*
     * NOTE(review): the filtered text can be longer than the input line
     * (indents, horizontal rules), and out is the same size as in.
     */
    while (fgets(in, BUFSIZE, stdin))
    {
        *out = 0;
        filter(in, out);
        fputs(out, stdout);
    }
    /*
     * filter() only outputs a word once it sees the whitespace after it,
     * so feed it a final newline in case the input did not end with one
     */
    *out = 0;
    filter(flush, out);
    fputs(out, stdout);
    fputs(NEWLINE, stdout);
    output_hrefs();
    fputs(SE_EOF, stdout);
    return 0;
}