using System.Xml;
using System.Xml.Linq;
using DynamicBible.DataPreparation.Models;
using DynamicBible.Schemas;
using JMW.Extensions.String;
using Microsoft.Extensions.Logging;
using Porter2Stemmer;
using Index = DynamicBible.DataPreparation.Models.Index;
namespace DynamicBible.DataPreparation;
// Builds the Dynamic Bible data set from KJV+Strongs XML sources: per-chapter
// bible JSON, a stemmed full-text search index, Strongs dictionary shards and
// cross-references, and RMAC morphology files. The optional logger reports progress.
public class BibleProcessor(ILogger<BibleProcessor>? logger = null)
{
// Stemmed words that have already been added to the search index.
private readonly HashSet<string> stems = [];
// Original (lowercased, unstemmed) words already mapped to a stem.
private readonly HashSet<string> words = [];
// Words seen with a leading capital letter (candidate proper nouns).
private readonly HashSet<string> capitals = [];
// Words seen in lowercase; used to prune false positives from `capitals`.
private readonly HashSet<string> lowercase = [];
// The main stem -> verse-reference search index.
private readonly Index idx = [];
// Maps each original word (w) to its Porter2 stem (s); serialized to word_to_stem_idx.json.
private readonly List<WordToStem> wordStemsIndex = [];
// Strongs number -> (english rendering -> verse refs) for the Old Testament (Hebrew).
private readonly Dictionary<string, Dictionary<string, StrongCrossReference>> hebCrossRefs = new();
// Strongs number -> (english rendering -> verse refs) for the New Testament (Greek).
private readonly Dictionary<string, Dictionary<string, StrongCrossReference>> grkCrossRefs = new();
// Stop words excluded from the search index.
private readonly List<string> exclusions =
[
"us", "these", "her", "saith", "shalt", "let", "do", "your", "we", "no", "go", "if", "at", "an", "so", "before", "also", "on", "had",
"you", "there", "then", "up", "by", "upon", "were", "are", "this", "when", "thee", "their", "ye", "will", "as", "thy", "my", "me", "have",
"from", "was", "but", "which", "thou", "all", "it", "with", "them", "him", "they", "is", "be", "not", "his", "i", "shall", "a", "for",
"unto", "he", "in", "to", "that", "of", "and", "the",
];
// Punctuation stripped from both ends of a token before indexing.
private readonly char[] trims =
[
'\'', ',', ':', ';', '"', '?', '.', '[', ']', '{', '}', '<', '>', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '=', '+',
];
// ASCII uppercase letters, used to detect capitalized words.
private readonly HashSet<char> uppers =
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
// Separator used when composing "book:chapter:verse" reference strings.
private readonly string referenceDelimiter = ":";
// Sentence-ending punctuation; decides whether a verse continues the previous sentence.
private readonly HashSet<string> endPunc = [".", "?", "!"];
// Summary row written to books.json.
private class BookRecord
{
public int Number { get; set; }
public string ShortName { get; set; } = string.Empty;
public string LongName { get; set; } = string.Empty;
public int LastChapter { get; set; }
// Chapters[n] holds the last verse number of chapter n; index 0 is a placeholder.
public List<int> Chapters { get; set; } = [];
}
// Serialized as {"w": word, "s": stem}; short names are intentional (JSON size).
private class WordToStem
{
public string w { get; set; } = "";
public string s { get; set; } = "";
}
// One English rendering of a Strongs number plus the verses where it occurs.
private record StrongCrossReference
{
public string? Word { get; set; }
public HashSet<string> Refs { get; set; } = [];
}
/// <summary>
/// Pipeline entry point: creates the output folders, builds the Strongs
/// dictionaries and RMAC data, builds the search index, then writes per-chapter
/// bible JSON, books.json, and the Hebrew/Greek Strongs cross-reference shards.
/// </summary>
/// <param name="dataPath">Folder containing the source XML/JSON data files.</param>
public void CreateText(string dataPath)
{
if (!Directory.Exists("bibles"))
{
Directory.CreateDirectory("bibles");
}
if (!Directory.Exists("bibles/kjv_strongs"))
{
Directory.CreateDirectory("bibles/kjv_strongs");
}
if (!Directory.Exists("strongscr"))
{
Directory.CreateDirectory("strongscr");
}
if (!Directory.Exists("strongs"))
{
Directory.CreateDirectory("strongs");
}
if (!Directory.Exists("index"))
{
Directory.CreateDirectory("index");
}
if (!Directory.Exists("rmac"))
{
Directory.CreateDirectory("rmac");
}
CreateStrongsDict(dataPath);
CreateRmac(dataPath);
CreateRmacCrossRefs(dataPath);
var xmlBible = XML.GetData<XMLBIBLE>(Path.Combine(dataPath, "sf_kjv_strongs_rev1b.xml"));
ArgumentNullException.ThrowIfNull(xmlBible);
CreateIndex(xmlBible);
// update the lower/upper lists: any word also seen in lowercase is not a
// proper noun, so drop its capitalized form from the capitals set.
foreach (var word in lowercase)
{
var q = word[0].ToString().ToUpper() + word.Substring(1);
if (capitals.Contains(q))
{
capitals.Remove(q);
}
}
// iterate through text, output json format.
var bookRecords = new List<BookRecord>();
var xmlDoc = XDocument.Load(Path.Combine(dataPath, "sf_kjv_strongs_rev1b.xml"));
ArgumentNullException.ThrowIfNull(xmlDoc.Root);
foreach (var n in xmlDoc.Root.Nodes())
{
if (n.NodeType != XmlNodeType.Element)
{
continue;
}
var el = (XElement)n;
if (el.Name != "BIBLEBOOK")
{
continue;
}
var bk = new Bible
{
BookNumber = Convert.ToInt32(el.FirstAttribute?.Value),
};
var br = new BookRecord
{
Number = bk.BookNumber,
};
bookRecords.Add(br);
// Placeholder so Chapters[n] lines up with 1-based chapter numbers.
br.Chapters.Add(0);
foreach (var xNode in el.Nodes())
{
var chn = (XElement)xNode;
var ch = new Chapter { ChapterId = Convert.ToInt32(chn.FirstAttribute?.Value) };
var last = 0;
// Trailing text of the previous verse, used to detect sentence boundaries.
var lastVsW = string.Empty;
foreach (var xNode1 in chn.Nodes())
{
var vs = (XElement)xNode1;
var v = new Verse
{
VerseId = Convert.ToInt32(vs.FirstAttribute?.Value),
};
last = v.VerseId;
var first = true;
foreach (var o in vs.Nodes())
{
var wordNodes = ProcessText(o, $"{bk.BookNumber}{referenceDelimiter}{ch.ChapterId}{referenceDelimiter}{v.VerseId}");
// handle lowercasing the word if its not the start of a sentence
if (
wordNodes.Count > 0
&& first
&& lastVsW.Length > 0
&& !endPunc.Contains(lastVsW.Last().ToString())
)
{
var x = wordNodes.First().Word;
var fw = x.ParseToIndexOf(" ");
// Leave capitalization intact for known proper nouns.
if (fw.Length > 0 && !capitals.Contains(fw))
{
x = x.Substring(0, 1).ToLower() + x.Substring(1);
wordNodes.First().Word = x;
}
}
first = false;
v.Word.AddRange(wordNodes);
// If the latest fragment begins with punctuation, glue it onto the
// previous word so punctuation never starts a rendered word.
if (v.Word.Count > 1
&& (
v.Word.Last().Word.StartsWith('?')
|| v.Word.Last().Word.StartsWith(';')
|| v.Word.Last().Word.StartsWith(':')
|| v.Word.Last().Word.StartsWith('.')
|| v.Word.Last().Word.StartsWith(',')
|| v.Word.Last().Word.StartsWith('!')
|| v.Word.Last().Word.StartsWith('-')
|| v.Word.Last().Word.StartsWith('\'')
)
)
{
var prev = v.Word[v.Word.Count - 2];
var curr = v.Word.Last();
// Hyphens/apostrophes join whole words (e.g. compound names); other
// punctuation only moves its single leading character back.
if (curr.Word.StartsWith("-") || curr.Word.StartsWith("'"))
{
if (curr.Word.Trim().Contains(" "))
{
prev.Word += curr.Word.Trim().Substring(0, curr.Word.Trim().IndexOf(" ", StringComparison.Ordinal));
curr.Word = curr.Word
.Trim()
.Substring(curr.Word.Trim().IndexOf(" ", StringComparison.Ordinal)); // you want to join the two words.
}
else
{
prev.Word += curr.Word.Trim();
curr.Word = ""; // you want to join the two words.
}
}
else
{
prev.Word += curr.Word.Substring(0, 1);
curr.Word = curr.Word.Substring(1).Trim();
}
// Drop the fragment entirely if the merge emptied it.
if (curr.Word.Trim().Length == 0)
{
v.Word.Remove(curr);
}
}
lastVsW = v.Word.Last().Word;
}
ch.Verses.Add(v);
}
bk.Chapters.Add(ch);
// Record the last verse number of this chapter for books.json.
br.Chapters.Add(last);
File.WriteAllText(
"bibles/kjv_strongs/" + bk.BookNumber + "-" + ch.ChapterId + ".json",
JSON.Serialize(ch).Replace(",\"s\":\"\"", "")
);
}
logger?.LogInformation("Book: {BookNumber}", bk.BookNumber);
}
// finished.
File.WriteAllText("books.json", JSON.Serialize(bookRecords));
// Write the Hebrew cross-references in shards of 100 Strongs numbers each.
var lst = hebCrossRefs.OrderBy(kvp => int.Parse(kvp.Key)).ToList();
var Strongs = new List<Strongs>();
var lastStrongs = 0;
foreach (var (id, refs) in lst)
{
var s = new Strongs
{
Id = "H" + id,
Testament = "heb",
StrongsReferences = refs
.Values
.Select(
cr =>
{
return new StrongRef
{
Word = cr.Word,
BibleReferences = cr.Refs
.Select(
br => new BibleRef
{
Reference = br,
}
)
.ToList(),
};
}
)
.OrderBy(o => o.Word)
.ToList(),
};
Strongs.Add(s);
// Crossing a multiple of 100 flushes the current shard to disk.
if (int.Parse(id) / 100 > lastStrongs)
{
lastStrongs = int.Parse(id) / 100;
File.WriteAllText($"strongscr/cr{s.Testament}{lastStrongs}.json", JSON.Serialize(Strongs));
Strongs = [];
logger?.LogInformation("Set: {lastStrongs}", lastStrongs);
}
}
// Flush whatever remains into one final shard.
File.WriteAllText("strongscr/crheb" + (lastStrongs + 1) + ".json", JSON.Serialize(Strongs));
logger?.LogInformation("Set: " + (lastStrongs + 1) + "\r\n");
// Same sharding scheme for the Greek cross-references.
lst = grkCrossRefs.OrderBy(kvp => int.Parse(kvp.Key)).ToList();
Strongs = [];
lastStrongs = 0;
foreach (var (id, refs) in lst)
{
var s = new Strongs
{
Id = "G" + id,
Testament = "grk",
StrongsReferences = refs
.Values
.Select(
cr =>
{
return new StrongRef
{
Word = cr.Word,
BibleReferences = cr.Refs
.Select(
br => new BibleRef
{
Reference = br,
}
)
.ToList(),
};
}
)
.OrderBy(o => o.Word)
.ToList(),
};
Strongs.Add(s);
if (int.Parse(id) / 100 > lastStrongs)
{
lastStrongs = int.Parse(id) / 100;
File.WriteAllText("strongscr/cr" + s.Testament + "" + lastStrongs + ".json", JSON.Serialize(Strongs));
Strongs = [];
logger?.LogInformation("Set: " + lastStrongs + "\r\n");
}
}
File.WriteAllText("strongscr/crgrk" + (lastStrongs + 1) + ".json", JSON.Serialize(Strongs));
logger?.LogInformation("Set: " + (lastStrongs + 1) + "\r\n");
}
/// <summary>
/// Converts one XML node of verse content into <c>Text</c> items, recording
/// Strongs cross-references as a side effect (Hebrew for books 1-39, Greek
/// otherwise). <c>style</c> wrappers are processed recursively.
/// </summary>
/// <param name="o">An <see cref="XText"/>, a "gr" element, or a "style" element.</param>
/// <param name="location">Verse reference in "book:chapter:verse" form.</param>
/// <returns>The text items extracted from the node.</returns>
/// <exception cref="InvalidOperationException">Thrown for unrecognized node types.</exception>
private List<Text> ProcessText(object o, string location)
{
    var textItems = new List<Text>();
    switch (o)
    {
        case XElement xmlElement:
            if (xmlElement.Name == "gr")
            {
                var strongsNumber = xmlElement.FirstAttribute?.Value;
                ArgumentNullException.ThrowIfNull(strongsNumber);
                var textValue = xmlElement.Value.Trim();
                var tv = textValue.ToLower();
                // Log verses that attach several Strongs numbers to one word.
                // Done once per occurrence here; the previous version appended the
                // same errata line once per split value, producing duplicates.
                if (strongsNumber.Contains(' '))
                {
                    File.AppendAllLines("errata.txt", ["Multiple strongs numbers found: " + location + ", " + tv]);
                }
                foreach (var val in strongsNumber.Split(' '))
                {
                    var sn = val.Trim('*', ' ');
                    // Books 1-39 are the Old Testament (Hebrew); the rest are Greek.
                    var crossRefs = int.Parse(location.ParseToIndexOf(referenceDelimiter)) < 40
                        ? hebCrossRefs
                        : grkCrossRefs;
                    if (!crossRefs.TryGetValue(sn, out var byWord))
                    {
                        byWord = new();
                        crossRefs.Add(sn, byWord);
                    }
                    if (!byWord.TryGetValue(tv, out var crossRef))
                    {
                        crossRef = new();
                        byWord.Add(tv, crossRef);
                    }
                    crossRef.Word = tv;
                    crossRef.Refs.Add(location);
                }
                textItems.Add(new(textValue, strongsNumber));
            }
            else if (xmlElement.Name.ToString().ToLower() == "style")
            {
                // Style wrappers just contain more text/gr nodes; recurse into them.
                foreach (var n in xmlElement.Nodes())
                {
                    textItems.AddRange(ProcessText(n, location));
                }
            }
            else
            {
                throw new InvalidOperationException("Unknown Element");
            }
            break;
        case XText xmlText:
            textItems.Add(new(xmlText.Value.Trim()));
            break;
        default:
            throw new InvalidOperationException("Unknown Element");
    }
    return textItems;
}
/// <summary>
/// Builds the word index from the bible, then writes the raw word list, the
/// word-to-stem map, and the index shards (50 stems per file, each file named
/// after the last stem it contains).
/// </summary>
/// <param name="xmlBible">The parsed KJV+Strongs bible.</param>
private void CreateIndex(XMLBIBLE xmlBible)
{
    // to index, you need to iterate through every word in the bible.
    PopulateIndex(xmlBible);
    File.WriteAllLines("word_list", stems);
    idx.Sort((x, y) => string.Compare(x.Word, y.Word, StringComparison.Ordinal));
    wordStemsIndex.Sort((x, y) => string.Compare(x.w, y.w, StringComparison.Ordinal));
    File.WriteAllText("index/word_to_stem_idx.json", JSON.Serialize(wordStemsIndex));
    var tmp = new Index();
    for (var i = 0; i < idx.Count; i++)
    {
        // NOTE: the original added idx[i] in both branches; hoisted here.
        // (Two stray VCS-timestamp lines embedded by a web scrape were removed.)
        tmp.Add(idx[i]);
        // Flush a shard every 50 entries, and always at the very end.
        if (i % 50 == 49 || i == idx.Count - 1)
        {
            // Emitted so the client-side word list can be rebuilt by pasting this output.
            Console.WriteLine("words.unshift('" + idx[i].Word + "');");
            var json = JSON.Serialize(tmp.ToArray());
            File.WriteAllText($"index/{idx[i].Word}idx.json", json);
            tmp.Clear();
        }
    }
}
/// <summary>
/// Walks every book/chapter/verse of the bible and feeds each whitespace-split
/// token to <see cref="AddWordToIndex"/>. Verse content can appear as plain
/// text, "gr" (Strongs-tagged) items, or nested "style" items, each of which is
/// handled separately below.
/// </summary>
/// <param name="b">The parsed KJV+Strongs bible.</param>
private void PopulateIndex(XMLBIBLE b)
{
foreach (var bk in b.BIBLEBOOKS)
{
foreach (var ch in bk.CHAPTERS)
foreach (var vs in ch.VERSES)
{
if (vs.Items != null)
{
foreach (var w in vs.Items)
{
// for each word, add an entry.
if (w.GetType() == typeof(XMLBIBLE_GR))
{
// Strongs-tagged text directly inside the verse.
var gr = (XMLBIBLE_GR)w;
if (gr.Text == null)
{
continue;
}
foreach (var textItem in gr.Text)
{
foreach (var word in textItem.Split(' '))
{
ArgumentNullException.ThrowIfNull(ch);
AddWordToIndex(
word,
bk.bnumber.ToString(),
ch.cnumber.ToString(),
vs.vnumber.ToString()
);
}
}
}
else if (w.GetType() == typeof(XMLBIBLE_STYLE_ITEM))
{
// Style items can carry plain text, a gr child, and nested styles.
var o = (XMLBIBLE_STYLE_ITEM)w;
if (o.Text != null)
{
foreach (var textItem in o.Text)
{
foreach (var word in textItem.Split(' '))
{
ArgumentNullException.ThrowIfNull(ch);
AddWordToIndex(
word,
bk.bnumber.ToString(),
ch.cnumber.ToString(),
vs.vnumber.ToString()
);
}
}
}
if (o.gr != null)
{
var gr = o.gr;
if (gr.Text != null)
{
foreach (var t in gr.Text)
{
foreach (var s in t.Split(' '))
{
ArgumentNullException.ThrowIfNull(ch);
AddWordToIndex(
s,
bk.bnumber.ToString(),
ch.cnumber.ToString(),
vs.vnumber.ToString()
);
}
}
}
}
if (o.STYLE == null)
{
continue;
}
// One level of nested styles: index their text and gr values too.
foreach (var so in o.STYLE)
{
if (so.Text != null)
{
foreach (var t in so.Text)
{
foreach (var s in t.Split(' '))
{
if (ch is null)
{
continue;
}
AddWordToIndex(
s,
bk.bnumber.ToString(),
ch.cnumber.ToString(),
vs.vnumber.ToString()
);
}
}
}
var gr = so.gr;
if (gr?.Value == null)
{
continue;
}
foreach (var s in gr.Value.Split(' '))
{
ArgumentNullException.ThrowIfNull(ch);
AddWordToIndex(
s,
bk.bnumber.ToString(),
ch.cnumber.ToString(),
vs.vnumber.ToString()
);
}
}
}
}
}
// Plain untagged verse text.
if (vs.Text == null)
{
continue;
}
foreach (var w in vs.Text)
{
foreach (var s in w.Split(' '))
{
if (s.Trim() != "")
{
ArgumentNullException.ThrowIfNull(ch);
AddWordToIndex(
s,
bk.bnumber.ToString(),
ch.cnumber.ToString(),
vs.vnumber.ToString()
);
}
}
}
}
logger?.LogInformation("Indexing Book: {bnumber}, Word Count: {Count}", bk.bnumber, stems.Count);
}
}
/// <summary>
/// Normalizes a raw token (trims punctuation, strips apostrophes, lowercases),
/// tracks its capitalization for proper-noun detection, and records its Porter2
/// stem plus the verse reference in the search index. Stop words are skipped.
/// </summary>
/// <param name="s">Raw word token, possibly carrying punctuation.</param>
/// <param name="bk">Book number.</param>
/// <param name="ch">Chapter number.</param>
/// <param name="vs">Verse number.</param>
private void AddWordToIndex(string s, string bk, string ch, string vs)
{
    // NOTE(review): the stemmer looks stateless; hoisting it to a field would save
    // a per-word allocation, but that is outside this method's scope — confirm first.
    var stemmer = new EnglishPorter2Stemmer();
    // Trim(trims) trims both ends in one pass (was TrimEnd + TrimStart).
    var cased = s.Trim().Trim(trims).Replace("'", "");
    s = cased.ToLower();
    // Track capitalization so sentence-start lowercasing can spare proper nouns.
    if (cased.Length > 0 && uppers.Contains(cased[0]))
    {
        capitals.Add(cased);
    }
    else if (cased.Length > 0)
    {
        lowercase.Add(cased);
    }
    if (s == "" || exclusions.Contains(s))
    {
        return;
    }
    var original = s;
    s = stemmer.Stem(s).Value;
    // Record the word -> stem mapping the first time this word is seen
    // (HashSet.Add doubles as the membership test).
    if (words.Add(original))
    {
        wordStemsIndex.Add(
            new()
            {
                s = s, w = original,
            }
        );
    }
    // add the word to the index
    var reference = bk + ":" + ch + ":" + vs;
    if (stems.Add(s))
    {
        var i = new IndexItem { Word = s };
        i.References.Add(reference);
        idx.Add(i);
    }
    else
    {
        var i = idx.GetItem(s);
        ArgumentNullException.ThrowIfNull(i);
        if (!i.References.Contains(reference))
        {
            i.References.Add(reference);
        }
    }
}
/// <summary>
/// Parses the grk*/heb* Strongs dictionary XML files, merges in lemmas from the
/// JSON dictionaries, and writes the entries to strongs/ in shards of 100
/// numbers per file.
/// </summary>
/// <param name="dataPath">Folder containing the xml/ subfolder and the JSON dictionaries.</param>
public void CreateStrongsDict(string dataPath)
{
// iterate through text, output json format.
var masterDict = new Dictionary<string, StrongDictEntry>();
var grkFilenames = Directory.EnumerateFiles(Path.Combine(dataPath, "xml"), "grk*", SearchOption.TopDirectoryOnly);
var hebFilenames = Directory.EnumerateFiles(Path.Combine(dataPath, "xml"), "heb*", SearchOption.TopDirectoryOnly);
var files = grkFilenames.Concat(hebFilenames);
foreach (var f in files)
{
var doc = XDocument.Load(f);
if (doc.Root is null)
{
logger?.LogError("Unable to process RMAC, no nodes.");
return;
}
foreach (var n in doc.Root.Nodes())
{
// Only <i> elements carry dictionary entries.
if (n.NodeType != XmlNodeType.Element)
{
continue;
}
var el = (XElement)n;
if (el.Name != "i")
{
continue;
}
var sr = new StrongDictEntry
{
Dict = el.FirstAttribute?.Value ?? throw new NullReferenceException(),
};
if (!masterDict.TryAdd(sr.Dict, sr))
{
// duplicate?
continue;
}
foreach (var xNode in el.Nodes())
{
var d = (XElement)xNode;
if (d.Name == "d")
{
// Flatten the definition markup to plain text, keeping <link> targets.
// NOTE(review): the repeated Replace(" ", " ") calls look like they once
// collapsed double spaces; verify the search strings were not mangled
// by whitespace normalization of this file.
var s = d.ToString()
.Replace("<d>", "")
.Replace("</d>", "")
.Replace("><", "> <")
.Replace("<br>", "")
.Replace("[", "")
.Replace("]", "")
.Replace(";", "; ")
.Replace("<br>", "")
.Replace(" ", " ")
.Replace(" ", " ")
.Replace(" ", " ")
.Replace(" ", " ")
.Replace(" ", " ")
.Replace("\r\n", "")
.Replace("<br />", "")
.Replace("\n", "");
// Split on link boundaries: link fragments become Strongs-number parts,
// everything else becomes plain word parts.
var parts = s.Split([ "<link", "</link>" ], StringSplitOptions.None);
foreach (var part in parts)
{
if (part.Contains("target="))
{
sr.Description.Add(
new()
{
StrongsNumber = part.ParseAfterLastIndexOf_PlusLength(">"),
}
);
}
else
{
sr.Description.Add(
new()
{
Word = part,
}
);
}
}
}
else if (d.Name == "p")
{
sr.Pronounciation = d.Value;
}
else if (d.Name == "tr")
{
sr.Translation = d.Value;
}
else if (d.Name == "t")
{
sr.Lemma = d.Value;
}
}
}
}
// combine with other javascript
foreach (var f in new List<string>
{
Path.Combine(dataPath, "strongs-greek-dictionary.json"),
Path.Combine(dataPath, "strongs-hebrew-dictionary.json"),
}
)
{
var doc = JSON.Deserialize<Dictionary<string, StrongDictEntry>>(File.ReadAllText(f));
ArgumentNullException.ThrowIfNull(doc);
var dict = doc.ToDictionary(k => k.Key, v => v.Value);
foreach (var pair in masterDict)
{
// Numeric part of the key ("H1234" -> 1234), used for ordering and sharding.
pair.Value.n = Convert.ToInt32(pair.Key.Substring(1));
if (dict.TryGetValue(pair.Key, out var dictValue))
{
if (dictValue.Lemma is not null)
{
pair.Value.Lemma = dictValue.Lemma;
}
}
}
}
var lst_heb = masterDict.Values.Where(o => o.Dict?.StartsWith("H") ?? false).OrderBy(o => o.n);
var lst_grk = masterDict.Values.Where(o => o.Dict?.StartsWith("G") ?? false).OrderBy(o => o.n);
var lsts = new List<IEnumerable<StrongDictEntry>> { lst_grk, lst_heb };
foreach (var lst in lsts)
{
var last = 0;
var temp = new List<StrongDictEntry>();
foreach (var e in lst)
{
temp.Add(e);
// Crossing a multiple of 100 flushes the current shard to disk.
if (e.n / 100 <= last)
{
continue;
}
last = e.n / 100;
var first = temp.FirstOrDefault()?.Dict ?? throw new InvalidOperationException("No data where expected!");
var strongsOutputName = "strongs/" + (first.Contains('H') ? "heb" : "grk") + last + ".json";
File.WriteAllText(strongsOutputName, JSON.Serialize(temp));
temp = [];
logger?.LogInformation("Set: {last}", last);
}
// handle the last set.
last = temp.Last().n / 100 + 1;
var lastDict = temp.FirstOrDefault()?.Dict ?? throw new InvalidOperationException("No data where expected!");
var lastStrongsOutputName = "strongs/" + (lastDict.Contains('H') ? "heb" : "grk") + last + ".json";
File.WriteAllText(lastStrongsOutputName, JSON.Serialize(temp));
logger?.LogInformation("Set: {last}", last);
}
}
/// <summary>
/// Converts the RMAC morphology XML files (r-*.xml) into JSON files under
/// rmac/, one output per input, named after the source file.
/// </summary>
/// <param name="dataPath">Folder whose xml/ subfolder holds the r-* files.</param>
public void CreateRmac(string dataPath)
{
    var fileNames = Directory.EnumerateFiles(Path.Combine(dataPath, "xml"), "r-*", SearchOption.TopDirectoryOnly);
    foreach (var f in fileNames)
    {
        var doc = XDocument.Load(f);
        var rmacs = new List<RMAC>();
        if (doc.Root is null)
        {
            logger?.LogError("Unable to process RMAC, no nodes.");
            return;
        }
        foreach (var n in doc.Root.Nodes())
        {
            // Only <i> elements carry RMAC entries; skip text/comment nodes.
            if (n is not XElement el || el.Name != "i")
            {
                continue;
            }
            var r = new RMAC
            {
                Id = el.FirstAttribute?.Value ?? throw new InvalidOperationException("Id cannot be null"),
                Description = [],
            };
            foreach (var xNode in el.Nodes())
            {
                var d = (XElement)xNode;
                r.Description.Add(d.Value);
            }
            rmacs.Add(r);
        }
        // GetFileNameWithoutExtension replaces the old fixed "strip 4 chars" trim,
        // which silently corrupted names for any extension that was not 4 chars.
        File.WriteAllText($"rmac/{Path.GetFileNameWithoutExtension(f)}.json", JSON.Serialize(rmacs));
        logger?.LogInformation("Set: {f}", f);
    }
}
/// <summary>
/// Converts the RMAC cross-reference XML files (rs*.xml) into JSON files under
/// rmac/, bumping the digit at position 2 of the file name by one
/// (e.g. rs0.xml -> rs1.json).
/// </summary>
/// <param name="dataPath">Folder whose xml/ subfolder holds the rs* files.</param>
public void CreateRmacCrossRefs(string dataPath)
{
    foreach (var file in Directory.EnumerateFiles(Path.Combine(dataPath, "xml"), "rs*", SearchOption.TopDirectoryOnly))
    {
        var document = XDocument.Load(file);
        var crossRefs = new List<RMACCrossRef>();
        if (document.Root is null)
        {
            logger?.LogError("Unable to process RMAC, no nodes.");
            return;
        }
        foreach (var node in document.Root.Nodes())
        {
            // Only <s> elements hold cross-references; everything else is skipped.
            if (node.NodeType != XmlNodeType.Element)
            {
                continue;
            }
            var element = (XElement)node;
            if (element.Name != "s")
            {
                continue;
            }
            crossRefs.Add(
                new RMACCrossRef
                {
                    Id = element.FirstAttribute?.Value ?? throw new InvalidOperationException("Id cannot be null"),
                    Reference = element.LastAttribute?.Value ?? throw new InvalidOperationException("Reference cannot be null"),
                }
            );
        }
        // Output index = the single digit at position 2 of the file name, plus one.
        var setNumber = int.Parse(Path.GetFileName(file).Substring(2, 1));
        File.WriteAllText($"rmac/rs{setNumber + 1}.json", JSON.Serialize(crossRefs));
        logger?.LogInformation("Set: {f}", file);
    }
}
}