// -------------------------------------------------------------------------------------------------
// BibleProcessor
//
// Purpose: reads the Bible XML file "sf_kjv_strongs_rev1b.xml" plus the dictionary / RMAC files
// found under <dataPath> and writes JSON output files. Every output path below is a relative
// path, so files are created relative to the process's current working directory.
//
// NOTE(review): in this copy of the file the generic type arguments appear to be missing
// (e.g. `HashSet stems`, `Dictionary> hebCrossRefs`, `JSON.Deserialize>(...)`), and several
// string literals in CreateStrongsDict are empty or broken across lines (the `.Replace("", "")`
// chain and the `s.Split([ "" ], ...)` delimiter). Presumably angle-bracketed text was stripped by
// an extraction step - confirm against the version-controlled original before changing any code.
//
// State (private fields)
//   stems / words        - stems and original (lower-cased) words already seen by AddWordToIndex.
//   capitals / lowercase - cased words whose first char is / is not in `uppers`; CreateText later
//                          removes from `capitals` every word that was also seen in lowercase.
//   idx                  - the search index (items expose Word and References).
//   wordStemsIndex       - word -> stem pairs (WordToStem.w / WordToStem.s).
//   hebCrossRefs / grkCrossRefs - filled by ProcessText: Strong's number -> lower-cased text ->
//                          StrongCrossReference (Word + set of "book:chapter:verse" references).
//   exclusions           - lower-cased words that AddWordToIndex does not index.
//   trims                - characters trimmed from both ends of a word before indexing.
//   endPunc              - ".", "?", "!"; used to decide whether a verse starts a new sentence.
//
// CreateText(dataPath)  [public entry point]
//   1. Creates the output directories bibles, bibles/kjv_strongs, strongscr, strongs, index, rmac
//      when they do not exist.
//   2. Calls CreateStrongsDict, CreateRmac and CreateRmacCrossRefs.
//   3. Loads the Bible XML through XML.GetData (throws ArgumentNullException when that returns
//      null) and calls CreateIndex.
//   4. Loads the same XML file a second time with XDocument.Load and walks every BIBLEBOOK element
//      -> chapter -> verse -> child node, converting each child with ProcessText.
//        * The first item of a verse has its first character lower-cased when the last text seen
//          so far in the chapter does not end in one of `endPunc` and its first word (obtained via
//          ParseToIndexOf(" ") - presumably "text before the first space"; helper not shown here)
//          is not in `capitals`.
//        * When the last item of the verse starts with one of ? ; : . , ! - ' it is merged into the
//          previous item: for - and ' the text up to the first space (or all of it) is appended;
//          otherwise only the first character is appended. Items left empty are removed.
//      Each chapter is written to bibles/kjv_strongs/<book>-<chapter>.json with every
//      `,"s":""` fragment removed from the serialized text.
//   5. Writes books.json from the BookRecord list. Chapters[0] is a 0 placeholder, followed by the
//      last verse number of each chapter. ShortName, LongName and LastChapter are never assigned
//      in the code visible here.
//   6. Writes the Hebrew and then the Greek cross references, ordered by numeric Strong's id.
//      The pending list is flushed to strongscr/cr<heb|grk><n>.json each time id / 100 exceeds the
//      previous n; the remainder goes to strongscr/cr<heb|grk><n + 1>.json.
//   NOTE(review): chapter and verse children are cast with `(XElement)` without a node-type check,
//   so a non-element child node would raise InvalidCastException - confirm the input never has any.
//   NOTE(review): `lastVsW = v.Word.Last().Word` throws if v.Word is still empty at that point
//   (e.g. a "style" element with no children as the first child of a verse) - verify.
//
// ProcessText(o, location)
//   * "gr" element: FirstAttribute holds one or more space-separated Strong's numbers. Each number
//     (trimmed of '*' and ' ') is recorded together with the lower-cased element text and
//     `location` in hebCrossRefs when the book number parsed from `location` is below 40, otherwise
//     in grkCrossRefs. When the attribute contains a space a line is appended to errata.txt (once
//     per number, because the check is inside the loop). Returns one item built from
//     (text, attribute value).
//   * "style" element (case-insensitive name): returns the concatenated result of its child nodes.
//   * XText: returns one item holding the trimmed text.
//   * anything else: throws with the message "Unknown Element".
//
// CreateIndex(xmlBible)
//   Calls PopulateIndex, writes the stems to "word_list", sorts idx and wordStemsIndex ordinally,
//   writes index/word_to_stem_idx.json, then writes idx in groups of 50 entries (the last group
//   may be shorter) to index/<Word of the group's last entry>idx.json and prints a
//   `words.unshift('<word>');` line to the console for each group.
//
// PopulateIndex(b)
//   Walks books -> chapters -> verses of the deserialized XML model and passes every
//   space-separated word found in gr items, style items (including their nested gr / STYLE
//   content) and the verse's own Text to AddWordToIndex. Logs one line per book.
//
// AddWordToIndex(s, bk, ch, vs)
//   Trims whitespace and `trims` characters, removes apostrophes, records the cased word in
//   `capitals` or `lowercase`, then - unless the lower-cased word is empty or in `exclusions` -
//   stems it and adds the reference "bk:ch:vs" to the index entry for that stem (no duplicates).
//   NOTE(review): a new EnglishPorter2Stemmer is allocated on every call and `exclusions` is
//   searched with Contains for every word; both look hoistable / convertible to a set.
//
// CreateStrongsDict(dataPath)
//   Reads every grk* and heb* file in <dataPath>/xml, creating one StrongDictEntry per "i" element
//   (the first entry wins when a key repeats) from its d / p / tr / t children. It then copies
//   Lemma values from strongs-greek-dictionary.json and strongs-hebrew-dictionary.json where
//   present, and writes the entries in groups to strongs/<heb|grk><n>.json.
//   NOTE(review): when doc.Root is null it logs "Unable to process RMAC, no nodes." (the message
//   mentions RMAC although Strong's files are being read) and returns, skipping all remaining work.
//   NOTE(review): a missing first attribute throws NullReferenceException directly.
//   NOTE(review): `temp.Last()` after the inner loop throws InvalidOperationException when the
//   final entry of a list triggered a flush (temp is then empty), or when a list is empty.
//
// CreateRmac(dataPath)
//   For each r-* file in <dataPath>/xml: one RMAC per "i" element (Id = first attribute value,
//   Description = the Value of every child element), written to
//   rmac/<file name minus its last 4 characters>.json.
//
// CreateRmacCrossRefs(dataPath)
//   For each rs* file in <dataPath>/xml: one RMACCrossRef per "s" element (Id = first attribute,
//   Reference = last attribute), written to rmac/rs<i + 1>.json where i is the single digit at
//   index 2 of the file name.
// -------------------------------------------------------------------------------------------------
using System.Xml; using System.Xml.Linq; using DynamicBible.DataPreparation.Models; using DynamicBible.Schemas; using JMW.Extensions.String; using Microsoft.Extensions.Logging; using Porter2Stemmer; using Index = DynamicBible.DataPreparation.Models.Index; namespace DynamicBible.DataPreparation; public class BibleProcessor(ILogger? logger = null) { private readonly HashSet stems = []; private readonly HashSet words = []; private readonly HashSet capitals = []; private readonly HashSet lowercase = []; private readonly Index idx = []; private readonly List wordStemsIndex = []; private readonly Dictionary> hebCrossRefs = new(); private readonly Dictionary> grkCrossRefs = new(); private readonly List exclusions = [ "us", "these", "her", "saith", "shalt", "let", "do", "your", "we", "no", "go", "if", "at", "an", "so", "before", "also", "on", "had", "you", "there", "then", "up", "by", "upon", "were", "are", "this", "when", "thee", "their", "ye", "will", "as", "thy", "my", "me", "have", "from", "was", "but", "which", "thou", "all", "it", "with", "them", "him", "they", "is", "be", "not", "his", "i", "shall", "a", "for", "unto", "he", "in", "to", "that", "of", "and", "the", ]; private readonly char[] trims = [ '\'', ',', ':', ';', '"', '?', '.', '[', ']', '{', '}', '<', '>', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '=', '+', ]; private readonly HashSet uppers = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']; private readonly string referenceDelimiter = ":"; private readonly HashSet endPunc = [".", "?", "!"]; private class BookRecord { public int Number { get; set; } public string ShortName { get; set; } = string.Empty; public string LongName { get; set; } = string.Empty; public int LastChapter { get; set; } public List Chapters { get; set; } = []; } private class WordToStem { public string w { get; set; } = ""; public string s { get; set; } = ""; } private record 
StrongCrossReference { public string? Word { get; set; } public HashSet Refs { get; set; } = []; } public void CreateText(string dataPath) { if (!Directory.Exists("bibles")) { Directory.CreateDirectory("bibles"); } if (!Directory.Exists("bibles/kjv_strongs")) { Directory.CreateDirectory("bibles/kjv_strongs"); } if (!Directory.Exists("strongscr")) { Directory.CreateDirectory("strongscr"); } if (!Directory.Exists("strongs")) { Directory.CreateDirectory("strongs"); } if (!Directory.Exists("index")) { Directory.CreateDirectory("index"); } if (!Directory.Exists("rmac")) { Directory.CreateDirectory("rmac"); } CreateStrongsDict(dataPath); CreateRmac(dataPath); CreateRmacCrossRefs(dataPath); var xmlBible = XML.GetData(Path.Combine(dataPath, "sf_kjv_strongs_rev1b.xml")); ArgumentNullException.ThrowIfNull(xmlBible); CreateIndex(xmlBible); // update the lower/upper lists foreach (var word in lowercase) { var q = word[0].ToString().ToUpper() + word.Substring(1); if (capitals.Contains(q)) { capitals.Remove(q); } } // iterate through text, output json format. 
var bookRecords = new List(); var xmlDoc = XDocument.Load(Path.Combine(dataPath, "sf_kjv_strongs_rev1b.xml")); ArgumentNullException.ThrowIfNull(xmlDoc.Root); foreach (var n in xmlDoc.Root.Nodes()) { if (n.NodeType != XmlNodeType.Element) { continue; } var el = (XElement)n; if (el.Name != "BIBLEBOOK") { continue; } var bk = new Bible { BookNumber = Convert.ToInt32(el.FirstAttribute?.Value), }; var br = new BookRecord { Number = bk.BookNumber, }; bookRecords.Add(br); br.Chapters.Add(0); foreach (var xNode in el.Nodes()) { var chn = (XElement)xNode; var ch = new Chapter { ChapterId = Convert.ToInt32(chn.FirstAttribute?.Value) }; var last = 0; var lastVsW = string.Empty; foreach (var xNode1 in chn.Nodes()) { var vs = (XElement)xNode1; var v = new Verse { VerseId = Convert.ToInt32(vs.FirstAttribute?.Value), }; last = v.VerseId; var first = true; foreach (var o in vs.Nodes()) { var wordNodes = ProcessText(o, $"{bk.BookNumber}{referenceDelimiter}{ch.ChapterId}{referenceDelimiter}{v.VerseId}"); // handle lowercasing the word if its not the start of a sentence if ( wordNodes.Count > 0 && first && lastVsW.Length > 0 && !endPunc.Contains(lastVsW.Last().ToString()) ) { var x = wordNodes.First().Word; var fw = x.ParseToIndexOf(" "); if (fw.Length > 0 && !capitals.Contains(fw)) { x = x.Substring(0, 1).ToLower() + x.Substring(1); wordNodes.First().Word = x; } } first = false; v.Word.AddRange(wordNodes); if (v.Word.Count > 1 && ( v.Word.Last().Word.StartsWith('?') || v.Word.Last().Word.StartsWith(';') || v.Word.Last().Word.StartsWith(':') || v.Word.Last().Word.StartsWith('.') || v.Word.Last().Word.StartsWith(',') || v.Word.Last().Word.StartsWith('!') || v.Word.Last().Word.StartsWith('-') || v.Word.Last().Word.StartsWith('\'') ) ) { var prev = v.Word[v.Word.Count - 2]; var curr = v.Word.Last(); if (curr.Word.StartsWith("-") || curr.Word.StartsWith("'")) { if (curr.Word.Trim().Contains(" ")) { prev.Word += curr.Word.Trim().Substring(0, curr.Word.Trim().IndexOf(" ", 
StringComparison.Ordinal)); curr.Word = curr.Word .Trim() .Substring(curr.Word.Trim().IndexOf(" ", StringComparison.Ordinal)); // you want to join the two words. } else { prev.Word += curr.Word.Trim(); curr.Word = ""; // you want to join the two words. } } else { prev.Word += curr.Word.Substring(0, 1); curr.Word = curr.Word.Substring(1).Trim(); } if (curr.Word.Trim().Length == 0) { v.Word.Remove(curr); } } lastVsW = v.Word.Last().Word; } ch.Verses.Add(v); } bk.Chapters.Add(ch); br.Chapters.Add(last); File.WriteAllText( "bibles/kjv_strongs/" + bk.BookNumber + "-" + ch.ChapterId + ".json", JSON.Serialize(ch).Replace(",\"s\":\"\"", "") ); } logger?.LogInformation("Book: {BookNumber}", bk.BookNumber); } // finished. File.WriteAllText("books.json", JSON.Serialize(bookRecords)); var lst = hebCrossRefs.OrderBy(kvp => int.Parse(kvp.Key)).ToList(); var Strongs = new List(); var lastStrongs = 0; foreach (var (id, refs) in lst) { var s = new Strongs { Id = "H" + id, Testament = "heb", StrongsReferences = refs .Values .Select( cr => { return new StrongRef { Word = cr.Word, BibleReferences = cr.Refs .Select( br => new BibleRef { Reference = br, } ) .ToList(), }; } ) .OrderBy(o => o.Word) .ToList(), }; Strongs.Add(s); if (int.Parse(id) / 100 > lastStrongs) { lastStrongs = int.Parse(id) / 100; File.WriteAllText($"strongscr/cr{s.Testament}{lastStrongs}.json", JSON.Serialize(Strongs)); Strongs = []; logger?.LogInformation("Set: {lastStrongs}", lastStrongs); } } File.WriteAllText("strongscr/crheb" + (lastStrongs + 1) + ".json", JSON.Serialize(Strongs)); logger?.LogInformation("Set: " + (lastStrongs + 1) + "\r\n"); lst = grkCrossRefs.OrderBy(kvp => int.Parse(kvp.Key)).ToList(); Strongs = []; lastStrongs = 0; foreach (var (id, refs) in lst) { var s = new Strongs { Id = "G" + id, Testament = "grk", StrongsReferences = refs .Values .Select( cr => { return new StrongRef { Word = cr.Word, BibleReferences = cr.Refs .Select( br => new BibleRef { Reference = br, } ) .ToList(), }; } ) 
.OrderBy(o => o.Word) .ToList(), }; Strongs.Add(s); if (int.Parse(id) / 100 > lastStrongs) { lastStrongs = int.Parse(id) / 100; File.WriteAllText("strongscr/cr" + s.Testament + "" + lastStrongs + ".json", JSON.Serialize(Strongs)); Strongs = []; logger?.LogInformation("Set: " + lastStrongs + "\r\n"); } } File.WriteAllText("strongscr/crgrk" + (lastStrongs + 1) + ".json", JSON.Serialize(Strongs)); logger?.LogInformation("Set: " + (lastStrongs + 1) + "\r\n"); } private List ProcessText(object o, string location) { var textItems = new List(); switch (o) { case XElement xmlElement: if (xmlElement.Name == "gr") { var strongsNumber = xmlElement.FirstAttribute?.Value; ArgumentNullException.ThrowIfNull(strongsNumber); var textValue = xmlElement.Value.Trim(); foreach (var val in strongsNumber.Split(' ')) { var tv = textValue.ToLower(); if (strongsNumber.Contains(" ")) { File.AppendAllLines("errata.txt", new List { "Multiple strongs numbers found: " + location + ", " + tv }); } var sn = val.Trim('*', ' '); if (int.Parse(location.ParseToIndexOf(referenceDelimiter)) < 40) { if (!hebCrossRefs.ContainsKey(sn)) { hebCrossRefs.Add(sn, new ()); } if (!hebCrossRefs[sn].ContainsKey(tv)) { hebCrossRefs[sn].Add(tv, new ()); } hebCrossRefs[sn][tv].Word = tv; hebCrossRefs[sn][tv].Refs.Add(location); } else { if (!grkCrossRefs.ContainsKey(sn)) { grkCrossRefs.Add(sn, new ()); } if (!grkCrossRefs[sn].ContainsKey(tv)) { grkCrossRefs[sn].Add(tv, new ()); } grkCrossRefs[sn][tv].Word = tv; grkCrossRefs[sn][tv].Refs.Add(location); } } textItems.Add(new (textValue, strongsNumber)); } else if (xmlElement.Name.ToString().ToLower() == "style") { foreach (var n in xmlElement.Nodes()) { textItems.AddRange(ProcessText(n, location)); } } else { throw new ("Unknown Element"); } break; case XText xmlText: var t = xmlText.Value.Trim(); textItems.Add(new (t)); break; default: throw new ("Unknown Element"); } return textItems; } private void CreateIndex(XMLBIBLE xmlBible) { // to index, you need to iterate 
through every word in the bible. PopulateIndex(xmlBible); File.WriteAllLines("word_list", stems); idx.Sort((x, y) => string.Compare(x.Word, y.Word, StringComparison.Ordinal)); wordStemsIndex.Sort((x, y) => string.Compare(x.w, y.w, StringComparison.Ordinal)); File.WriteAllText("index/word_to_stem_idx.json", JSON.Serialize(wordStemsIndex)); var tmp = new Index(); int i; for (i = 0; i < idx.Count; i++) { if (i % 50 == 49 || i == idx.Count - 1) { tmp.Add(idx[i]); Console.WriteLine("words.unshift('" + idx[i].Word + "');"); var json = JSON.Serialize(tmp.ToArray()); File.WriteAllText($"index/{idx[i].Word}idx.json", json); tmp.Clear(); } else { tmp.Add(idx[i]); } } } private void PopulateIndex(XMLBIBLE b) { foreach (var bk in b.BIBLEBOOKS) { foreach (var ch in bk.CHAPTERS) foreach (var vs in ch.VERSES) { if (vs.Items != null) { foreach (var w in vs.Items) { // for each word, add an entry. if (w.GetType() == typeof(XMLBIBLE_GR)) { var gr = (XMLBIBLE_GR)w; if (gr.Text == null) { continue; } foreach (var textItem in gr.Text) { foreach (var word in textItem.Split(' ')) { ArgumentNullException.ThrowIfNull(ch); AddWordToIndex( word, bk.bnumber.ToString(), ch.cnumber.ToString(), vs.vnumber.ToString() ); } } } else if (w.GetType() == typeof(XMLBIBLE_STYLE_ITEM)) { var o = (XMLBIBLE_STYLE_ITEM)w; if (o.Text != null) { foreach (var textItem in o.Text) { foreach (var word in textItem.Split(' ')) { ArgumentNullException.ThrowIfNull(ch); AddWordToIndex( word, bk.bnumber.ToString(), ch.cnumber.ToString(), vs.vnumber.ToString() ); } } } if (o.gr != null) { var gr = o.gr; if (gr.Text != null) { foreach (var t in gr.Text) { foreach (var s in t.Split(' ')) { ArgumentNullException.ThrowIfNull(ch); AddWordToIndex( s, bk.bnumber.ToString(), ch.cnumber.ToString(), vs.vnumber.ToString() ); } } } } if (o.STYLE == null) { continue; } foreach (var so in o.STYLE) { if (so.Text != null) { foreach (var t in so.Text) { foreach (var s in t.Split(' ')) { if (ch is null) { continue; } AddWordToIndex( s, 
bk.bnumber.ToString(), ch.cnumber.ToString(), vs.vnumber.ToString() ); } } } var gr = so.gr; if (gr?.Value == null) { continue; } foreach (var s in gr.Value.Split(' ')) { ArgumentNullException.ThrowIfNull(ch); AddWordToIndex( s, bk.bnumber.ToString(), ch.cnumber.ToString(), vs.vnumber.ToString() ); } } } } } if (vs.Text == null) { continue; } foreach (var w in vs.Text) { foreach (var s in w.Split(' ')) { if (s.Trim() != "") { ArgumentNullException.ThrowIfNull(ch); AddWordToIndex( s, bk.bnumber.ToString(), ch.cnumber.ToString(), vs.vnumber.ToString() ); } } } } logger?.LogInformation("Indexing Book: {bnumber}, Word Count: {Count}", bk.bnumber, stems.Count); } } private void AddWordToIndex(string s, string bk, string ch, string vs) { var stemmer = new EnglishPorter2Stemmer(); var cased = s.Trim().TrimEnd(trims).TrimStart(trims).Replace("'", ""); s = cased.ToLower(); if (cased.Length > 0 && uppers.Contains(cased[0])) { capitals.Add(cased); } else if (cased.Length > 0) { lowercase.Add(cased); } if (s != "" && !exclusions.Contains(s)) { var original = s; var stemmedWord = stemmer.Stem(s); s = stemmedWord.Value; if (!words.Contains(original)) { wordStemsIndex.Add( new() { s = s, w = original, } ); words.Add(original); } // add the word to the index if (stems.Add(s)) { var i = new IndexItem { Word = s }; i.References.Add(bk + ":" + ch + ":" + vs); idx.Add(i); } else { var i = idx.GetItem(s); ArgumentNullException.ThrowIfNull(i); if (!i.References.Contains(bk + ":" + ch + ":" + vs)) { i.References.Add(bk + ":" + ch + ":" + vs); } } } } public void CreateStrongsDict(string dataPath) { // iterate through text, output json format. 
var masterDict = new Dictionary(); var grkFilenames = Directory.EnumerateFiles(Path.Combine(dataPath, "xml"), "grk*", SearchOption.TopDirectoryOnly); var hebFilenames = Directory.EnumerateFiles(Path.Combine(dataPath, "xml"), "heb*", SearchOption.TopDirectoryOnly); var files = grkFilenames.Concat(hebFilenames); foreach (var f in files) { var doc = XDocument.Load(f); if (doc.Root is null) { logger?.LogError("Unable to process RMAC, no nodes."); return; } foreach (var n in doc.Root.Nodes()) { if (n.NodeType != XmlNodeType.Element) { continue; } var el = (XElement)n; if (el.Name != "i") { continue; } var sr = new StrongDictEntry { Dict = el.FirstAttribute?.Value ?? throw new NullReferenceException(), }; if (!masterDict.TryAdd(sr.Dict, sr)) { // duplicate? continue; } foreach (var xNode in el.Nodes()) { var d = (XElement)xNode; if (d.Name == "d") { var s = d.ToString() .Replace("", "") .Replace("", "") .Replace("><", "> <") .Replace("
", "") .Replace("[", "") .Replace("]", "") .Replace(";", "; ") .Replace("
", "") .Replace(" ", " ") .Replace(" ", " ") .Replace(" ", " ") .Replace(" ", " ") .Replace(" ", " ") .Replace("\r\n", "") .Replace("
", "") .Replace("\n", ""); var parts = s.Split([ "" ], StringSplitOptions.None); foreach (var part in parts) { if (part.Contains("target=")) { sr.Description.Add( new() { StrongsNumber = part.ParseAfterLastIndexOf_PlusLength(">"), } ); } else { sr.Description.Add( new() { Word = part, } ); } } } else if (d.Name == "p") { sr.Pronounciation = d.Value; } else if (d.Name == "tr") { sr.Translation = d.Value; } else if (d.Name == "t") { sr.Lemma = d.Value; } } } } // combine with other javascript foreach (var f in new List { Path.Combine(dataPath, "strongs-greek-dictionary.json"), Path.Combine(dataPath, "strongs-hebrew-dictionary.json"), } ) { var doc = JSON.Deserialize>(File.ReadAllText(f)); ArgumentNullException.ThrowIfNull(doc); var dict = doc.ToDictionary(k => k.Key, v => v.Value); foreach (var pair in masterDict) { pair.Value.n = Convert.ToInt32(pair.Key.Substring(1)); if (dict.TryGetValue(pair.Key, out var dictValue)) { if (dictValue.Lemma is not null) { pair.Value.Lemma = dictValue.Lemma; } } } } var lst_heb = masterDict.Values.Where(o => o.Dict?.StartsWith("H") ?? false).OrderBy(o => o.n); var lst_grk = masterDict.Values.Where(o => o.Dict?.StartsWith("G") ?? false).OrderBy(o => o.n); var lsts = new List> { lst_grk, lst_heb }; foreach (var lst in lsts) { var last = 0; var temp = new List(); foreach (var e in lst) { temp.Add(e); if (e.n / 100 <= last) { continue; } last = e.n / 100; var first = temp.FirstOrDefault()?.Dict ?? throw new InvalidOperationException("No data where expected!"); var strongsOutputName = "strongs/" + (first.Contains('H') ? "heb" : "grk") + last + ".json"; File.WriteAllText(strongsOutputName, JSON.Serialize(temp)); temp = []; logger?.LogInformation("Set: {last}", last); } // handle the last set. last = temp.Last().n / 100 + 1; var lastDict = temp.FirstOrDefault()?.Dict ?? throw new InvalidOperationException("No data where expected!"); var lastStrongsOutputName = "strongs/" + (lastDict.Contains('H') ? 
"heb" : "grk") + last + ".json"; File.WriteAllText(lastStrongsOutputName, JSON.Serialize(temp)); logger?.LogInformation("Set: {last}", last); } } public void CreateRmac(string dataPath) { var fileNames = Directory.EnumerateFiles(Path.Combine(dataPath, "xml"), "r-*", SearchOption.TopDirectoryOnly); foreach (var f in fileNames) { var doc = XDocument.Load(f); var rmacs = new List(); if (doc.Root is null) { logger?.LogError("Unable to process RMAC, no nodes."); return; } foreach (var n in doc.Root.Nodes()) { if (n.NodeType != XmlNodeType.Element) { continue; } var el = (XElement)n; if (el.Name != "i") { continue; } var r = new RMAC { Id = el.FirstAttribute?.Value ?? throw new InvalidOperationException("Id cannot be null"), Description = [], }; foreach (var xNode in el.Nodes()) { var d = (XElement)xNode; r.Description.Add(d.Value); } rmacs.Add(r); } var fi = new FileInfo(f); File.WriteAllText($"rmac/{fi.Name.Substring(0, fi.Name.Length - 4)}.json", JSON.Serialize(rmacs)); logger?.LogInformation("Set: {f}", f); } } public void CreateRmacCrossRefs(string dataPath) { var fileNames = Directory.EnumerateFiles(Path.Combine(dataPath, "xml"), "rs*", SearchOption.TopDirectoryOnly); foreach (var f in fileNames) { var doc = XDocument.Load(f); var rmacs = new List(); if (doc.Root is null) { logger?.LogError("Unable to process RMAC, no nodes."); return; } foreach (var n in doc.Root.Nodes()) { if (n.NodeType != XmlNodeType.Element) { continue; } var el = (XElement)n; if (el.Name != "s") { continue; } var r = new RMACCrossRef { Id = el.FirstAttribute?.Value ?? throw new InvalidOperationException("Id cannot be null"), Reference = el.LastAttribute?.Value ?? throw new InvalidOperationException("Reference cannot be null"), }; rmacs.Add(r); } var fi = new FileInfo(f); var i = int.Parse(fi.Name.Substring(2, 1)); File.WriteAllText($"rmac/rs{i + 1}.json", JSON.Serialize(rmacs)); logger?.LogInformation("Set: {f}", f); } } }