Skip to content

Instantly share code, notes, and snippets.

@dontcallmedom
Last active July 7, 2021 07:34
Show Gist options
  • Save dontcallmedom/5f620e1d2dc5ff9467b210e81a51b637 to your computer and use it in GitHub Desktop.
Save dontcallmedom/5f620e1d2dc5ff9467b210e81a51b637 to your computer and use it in GitHub Desktop.
RFC references extractor
# We're in a directory where the plain text RFCs and their JSON metadata have
# been extracted.
# Move the metadata of obsolete RFCs (those whose "obsoleted_by" list is not
# empty) to an obsolete dir.
# NOTE(review): only the .json file is moved; any corresponding .txt stays in
# place and is still picked up by the ../*.txt scan further down -- confirm
# that this is intended.
move_obsolete_metadata() {
  local meta
  # -p: do not fail when the directory is left over from a previous run
  mkdir -p obsolete
  for meta in *.json; do
    # an unmatched glob stays literal; skip it instead of feeding it to jq
    [ -e "$meta" ] || continue
    # jq prints one line per "obsoleted_by" entry, so any output means obsolete
    if [ -n "$(jq '.obsoleted_by[]' "$meta")" ]; then
      mv -- "$meta" obsolete/
    fi
  done
}
move_obsolete_metadata
# refs: working dir
# rfc-refs: directory where we will store the results:
# one file per RFC which lists the names of the normatively referenced RFCs
# (-p keeps a re-run from failing on directories that already exist)
mkdir -p refs rfc-refs
# The steps below write into the current directory and into ../rfc-refs, so
# stop here rather than carry on in the wrong place.
cd refs || exit 1
# Ignore Informational RFCs and only deal with those that use the phrase
# "Normative References" as heading (not prefixed with "Non" or inside some
# other text, but allowing for prefixes like "A." or "Appendix" or "7.1").
#
# Every selected <name>.txt is split at each matching heading line into
# <name>-00, <name>-01, ... in the current directory.
# Arguments: $1 - directory holding the .txt files (default: ..)
split_on_normative_references() {
  local src_dir=${1:-..} rfc_txt
  # grep -L: files that never mention "Informational";
  # grep -l: of those, the ones that contain the heading phrase.
  # xargs -r skips the second grep when nothing is left, -d '\n' keeps file
  # names containing blanks in one piece (both are GNU xargs options).
  grep -L "Informational" "$src_dir"/*.txt \
    | xargs -r -d '\n' grep -l "Normative References" \
    | while IFS= read -r rfc_txt; do
        # -z drops empty pieces, '{*}' repeats the split for every match
        csplit -s -z -f "$(basename "$rfc_txt" .txt)-" "$rfc_txt" \
          "/^\([A0-9 ][^-\"]*\)*Normative References/" '{*}'
      done
}
split_on_normative_references
# TODO: Deal with 6 documents that use the phrase "Normative references" (case)
# TODO: Deal with 3 documents that use the phrase "Normative" (case)
# TODO: Deal with 717 documents that don't use the phrase "Normative References"
# grep -L "Informational" *.txt|xargs grep -L "Experimental"|xargs grep -L "Normative References"|xargs grep -L "Informative References"|xargs grep "References"
# For every split RFC (recognised by its <root>-00 piece), keep the last piece
# of the split, i.e. the one that starts at the last matched
# "Normative References" heading.
# Remove pagination lines (they start with a non-blank character and are at
# least 72 characters wide) and split off the first section by heading (lines
# starting with a letter or digit). The section ends up in <last piece>-00.
#
# Meant for a fresh working directory: the pieces written here also match
# *-00 and <root>-*, so a second run would pick them up as input.
split_off_reference_section() {
  local first_piece root piece last
  for first_piece in *-00; do
    # an unmatched glob stays literal; nothing to do then
    [ -e "$first_piece" ] || continue
    root=${first_piece%%-*}
    # globs expand in sorted order, so the last match is the last piece
    last=
    for piece in "$root"-*; do
      last=$piece
    done
    # -z drops the empty piece in front of the heading line itself.
    # -k keeps the output when fewer than two headings are found (reference
    # section at the very end of the document); without it csplit deletes
    # everything it wrote and the RFC silently disappears from the results.
    sed '/^\S.\{70\}./d' "$last" \
      | csplit -s -z -k -f "$last-" - "/^[0-9a-zA-Z]/" '{1}'
  done
}
split_off_reference_section
# In the list of references, extract anything that looks like an RFC after
# having removed in-paragraph line returns.

# Reads a reference section on stdin and prints the sorted, de-duplicated RFC
# identifiers (e.g. RFC2119) found in it, one per line.
extract_rfc_ids() {
  # Newlines become "_": a blank line ("__") then marks the end of an entry,
  # while a single "_" plus the indentation that follows is an in-paragraph
  # line return and collapses to one space.
  # The greedy leading ".*" keeps only the last "RFC <number>" of each entry.
  tr '\n' '_' \
    | sed -e 's/__/\n/g' -e 's/_ */ /g' \
    | grep 'RFC \?[0-9]' \
    | sed -e 's/^.*\(RFC \?[0-9][0-9]*\).*$/\1/' \
    | tr -d ' ' \
    | sort -u
}

# Writes ../rfc-refs/<rfc> for every reference section <rfc>-NN-00 found in
# the current directory.
collect_rfc_refs() {
  local section rfc
  for section in *-*-00; do
    # an unmatched glob stays literal; do not create a result file for it
    [ -e "$section" ] || continue
    rfc=${section%%-*}
    extract_rfc_ids < "$section" > "../rfc-refs/$rfc"
  done
}
collect_rfc_refs
# cd ../rfc-refs/
# wc -l|sort -rn
# 0 rfc5234 no rfc ref
# 0 rfc4506 no rfc ref
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment