git » libfiu » master » tree

[master] / preload / posix / utils / extract_from_man

#!/usr/bin/env python3
# encoding: utf8

"""
Extracts information from a manpage (read from stdin) that can be useful to
create modules for the code generator.

Example usage:
	man 3posix chmod | extract_from_man

Or, in a loop:

	rm -f gen.mod;
	for f in chmod chown chdir; do
		man 3posix $f | extract_from_man >> gen.mod;
	done

"""

import sys
import re


def wrap(s, cols, indent = 1):
	"""Wraps the words of s into lines of at most cols characters.

	Lines are joined with a backslash continuation (' \\' followed by a
	newline) and every line after the first is prefixed with 'indent'
	tabs. Runs of whitespace in s are collapsed to a single space.

	A word longer than cols is never split; it is placed on a line of
	its own. The first word never produces an empty leading line.

	Returns the wrapped string, without trailing whitespace.
	"""
	lines = []
	line = ''
	for w in s.split():
		# only account for the separating space when there already is
		# something on the current line
		candidate = line + ' ' + w if line else w
		if line and len(candidate) > cols:
			lines.append(line)
			line = w
		else:
			line = candidate

	lines.append(line)

	return (' \\\n' + '\t' * indent).join(lines).rstrip()


def extract_sections(f):
	"""Reads a manpage from the file, returns a dictionary of sections.

	f is any iterable of text lines. A non-blank line that does not
	start with a space or a tab opens a new section, named after the
	stripped line; indented lines are appended verbatim to the section
	being read. Blank lines are dropped.

	Anything seen before the first heading is stored under the empty
	string key, which is therefore always present in the result.
	"""
	sections = {}
	title = ''
	body = []

	for raw in f:
		if raw.strip() == '':
			continue

		if raw[:1] in (' ', '\t'):
			body.append(raw)
			continue

		# a new heading: store what we have collected so far
		sections[title] = ''.join(body)
		title = raw.strip()
		body = []

	# store the last section too
	sections[title] = ''.join(body)

	return sections

def get_ret_on_error(sections):
	"""Tries to find out what the function returns on error.

	Searches the 'RETURN VALUE' section for a few common phrasings and
	collects the value named by each one that matches.

	sections is a dictionary of section name -> section text, as
	returned by extract_sections().

	Returns None if there is no 'RETURN VALUE' section; otherwise a
	list with the distinct values found, in the order the patterns
	matched (empty if none matched). Nothing is printed, so the
	program output only contains the generated module text.
	"""
	if 'RETURN VALUE' not in sections:
		return None

	# collapse spaces and newlines to make it easier to detect the
	# patterns, which can span several lines in the manpage
	s = ' '.join(sections['RETURN VALUE'].split())

	# Note: the '(-|‐)' regexp matches both the normal minus sign ('-')
	# and the UTF-8 hyphen sign ('‐', or \xe2\x80\x90); sadly both usually
	# look the same
	regexps = [
		r'On error,? (?P<ev>[-\w]+) is returned',
		r'On error,? .* returns? (?P<ev>[-\w]+).',
		r'some error occurs,? (?P<ev>[-\w]+) is returned',
		r'and (?P<ev>[-\w]+) if an error occurr(s|ed)',
		r'[Oo]ther((-|‐) )?wise, (?P<ev>[-\w]+) shall be returned',
		r'Other((-|‐) )?wise, the functions shall return (?P<ev>[-\w]+) and'
	]

	possible_errors = []
	for regexp in regexps:
		m = re.search(regexp, s)
		if not m:
			continue

		# several phrasings can name the same value; report it once
		ev = m.group('ev')
		if ev not in possible_errors:
			possible_errors.append(ev)
	return possible_errors

def get_possible_errnos(sections):
	"""Tries to find out the possible valid errno values after the
	function has failed.

	Scans the 'ERRORS' section for indented lines that begin with one
	or more upper case names (at least three characters long, digits
	allowed after the first letter so names like E2BIG are found),
	optionally separated by commas and/or spaces.

	sections is a dictionary of section name -> section text, as
	returned by extract_sections().

	Returns None if there is no 'ERRORS' section; otherwise the list of
	names in the order they appear (duplicates are kept).
	"""
	if 'ERRORS' not in sections:
		return None

	# a single errno-like name
	name_re = re.compile(r'[A-Z][A-Z0-9]{2,}')

	# an indented line that starts with a list of such names
	line_re = re.compile(
		r'\s+(?P<e>(' + name_re.pattern + r',? *)+)')

	errnos = []

	for l in sections['ERRORS'].split('\n'):
		m = line_re.match(l)
		if m:
			# pick the names out of the matched prefix; this copes
			# with trailing commas (lists wrapped over several
			# lines) and with space-separated names
			errnos.extend(name_re.findall(m.group('e')))

	return errnos

def get_defs(sections):
	"""Tries to find out the includes and function definitions.

	sections is a dictionary of section name -> section text, as
	returned by extract_sections().

	Returns None if there is no 'SYNOPSIS' section; otherwise a tuple
	(includes, funcs) where includes is the list of '#include' operands
	(e.g. '<sys/stat.h>') and funcs is the list of function prototypes,
	each one joined into a single line.
	"""
	if 'SYNOPSIS' not in sections:
		return None

	includes = []
	funcs = []

	# an indented line that looks like (part of) a prototype: it must
	# end with ';' (complete) or ',' (continues on the next line)
	fre = re.compile(r'\s+(?P<f>[\w,\*\s]+\(?(\w|,|\*|\s|\.\.\.)*\)?[,;])$')

	for l in sections['SYNOPSIS'].split('\n'):
		sl = l.strip()
		if sl.startswith('#include'):
			# take whatever follows the directive, regardless of
			# how much (or what kind of) whitespace separates them
			header = sl[len('#include'):].strip()
			if header:
				includes.append(header)
			# '#' can never be part of a prototype match
			continue

		m = fre.match(l.rstrip())
		if m:
			f = m.group('f')

			# long functions are split in multiple lines, this
			# tries to detect that and append to the last seen
			# function
			if funcs and not funcs[-1].endswith(';'):
				funcs[-1] += ' ' + f
			else:
				funcs.append(f)
	return (includes, funcs)


if __name__ == '__main__':

	# the manpage is read from stdin; any argument is a usage error
	if len(sys.argv) > 1:
		print(__doc__)
		sys.exit(1)

	s = extract_sections(sys.stdin)
	on_error = get_ret_on_error(s)
	errnos = get_possible_errnos(s)

	# get_defs() returns None when the manpage has no SYNOPSIS section;
	# fall back to empty lists so the rest of the output is still
	# produced instead of failing to unpack
	incs, funcs = get_defs(s) or ([], [])

	print('\n'.join( 'include: ' + i for i in incs))
	print()

	print('\n'.join(funcs))

	if on_error:
		print('\ton error:', ' || '.join(on_error))

	if errnos:
		print('\tvalid errnos:', wrap(' '.join(sorted(set(errnos))),
				60, indent = 2))