2024-03-25 16:14:17 +00:00

190 lines
9.1 KiB
C#

using System.Globalization;
using DynamicBibleUtility.Geolocation;
namespace DynamicBible.DataPreparation.Models.Geolocation;
/// <summary>
/// A parser for Biblical location information from openbible.info.
/// Specifically, this class only handles parsing the tab-delimited
/// file from https://www.openbible.info/geo/data/merged.txt.
/// Parsing this specific file was chosen over the KMZ/KML files
/// because it was a much simpler way to get the relevant data.
/// The "merged" version of the raw data was chosen over the
/// "unmerged" version because it seemed to contain more data.
/// The data parsed by this parser is licensed under the
/// Creative Commons Attribution license (see
/// https://www.openbible.info/geo/ and
/// https://creativecommons.org/licenses/by/4.0/).
/// Strong's numbers are added using <see cref="LocationNameToStrongsNumberLookup" />.
/// </summary>
public class OpenBibleDotInfoLocationParser
{
    /// <summary>
    /// The separator between the chapter and verse numbers within a verse reference (e.g. "12:8").
    /// </summary>
    private static readonly char[] VerseSeparator = [ ':' ];

    /// <summary>
    /// Parses Biblical location information from the specified file.
    /// The first two lines of the file (a comment line and a header line) are skipped;
    /// every remaining line is expected to be a tab-delimited record containing,
    /// in order, the Biblical location name, the name of the geographic location,
    /// the latitude, the longitude, and a comma-separated list of verse references.
    /// All numbers are parsed using the invariant culture, and names are lowercased
    /// using the invariant culture, so results do not depend on the current thread's culture.
    /// </summary>
    /// <param name="filepath">The relative or absolute path to the file to parse.</param>
    /// <returns>Location references parsed from the file; never null.</returns>
    /// <exception cref="Exception">
    /// Thrown if a parsing error occurs (for example, a line with too few fields
    /// or a coordinate, chapter, or verse that is not numeric) or if the file cannot be read.
    /// </exception>
    public static IEnumerable<BibleLocationReference> Parse(string filepath)
    {
        // READ THE ENTIRE GEOLOCATION DATA FILE.
        // It is small enough to store completely in memory.
        var geolocation_input_file_lines = File.ReadAllLines(filepath);

        // PARSE EACH LINE OF GEOLOCATION DATA.
        // The first line contains a comment and the second line contains a header,
        // so those two lines can be skipped.
        const int FIRST_GEOLOCATION_LINE_INDEX = 2;
        var locations = new List<BibleLocationReference>();
        for (var line_index = FIRST_GEOLOCATION_LINE_INDEX; line_index < geolocation_input_file_lines.Length; ++line_index)
        {
            // SPLIT THE LINE INTO SEPARATE FIELDS.
            // Since empty fields sometimes exist in the actual data, empty entries are still included
            // from the string splitting operation to make indexing into known fields simpler.
            const char FIELD_SEPARATOR = '\t';
            var current_geolocation_line = geolocation_input_file_lines[line_index];
            var current_line_fields = current_geolocation_line.Split(FIELD_SEPARATOR);

            // PARSE THE LOCATION INFORMATION FROM CURRENT LINE.
            var location = new BibleLocationReference();

            // The name is converted to lowercase to make it easier to do
            // case insensitive lookups. The invariant culture is used so that
            // the result is the same regardless of the machine's regional settings
            // (culture-sensitive lowercasing maps 'I' differently under some cultures).
            const int BIBLE_LOCATION_NAME_FIELD_INDEX = 0;
            location.Name = current_line_fields[BIBLE_LOCATION_NAME_FIELD_INDEX].ToLowerInvariant();

            // The file contains both the name of the location as mentioned in the Bible (parsed above)
            // and a second name (at field index 1) for the actual location that the geographic
            // coordinates reference. Since the geographic coordinates are expected to be close enough
            // to the Biblical name and the primary purpose of this data is to cross-reference the
            // Biblical text, this second name is silently ignored but could be added later if desired.

            const int LATITUDE_INDEX = 2;
            var latitude_string = current_line_fields[LATITUDE_INDEX];
            location.Latitude = ParseGeographicCoordinate(latitude_string);

            const int LONGITUDE_INDEX = 3;
            var longitude_string = current_line_fields[LONGITUDE_INDEX];
            location.Longitude = ParseGeographicCoordinate(longitude_string);

            const int VERSE_REFERENCES_INDEX = 4;
            var verse_references_csv_list = current_line_fields[VERSE_REFERENCES_INDEX];
            location.VerseReferences = ParseVerseReferences(verse_references_csv_list);

            // ADD STRONG'S NUMBERS REFERENCES TO THE LOCATION.
            location.StrongsNumbers = LocationNameToStrongsNumberLookup.GetStrongsNumbers(location.Name);

            // ADD THE LOCATION INFORMATION FOR RETURNING.
            locations.Add(location);
        }

        return locations;
    }

    /// <summary>
    /// Attempts to parse a geographic coordinate from the specified string.
    /// This method is necessary because not all coordinate values in the file
    /// are necessarily completely numeric.
    /// </summary>
    /// <param name="coordinate_string">The coordinate string to parse.</param>
    /// <returns>
    /// The geographic coordinate, if successfully parsed.
    /// Null only if no geographic coordinate exists (an exception is thrown
    /// if an unexpected parsing error occurs in order to provide easier visibility
    /// into such errors).
    /// </returns>
    /// <exception cref="Exception">Thrown if a parsing error occurs.</exception>
    private static double? ParseGeographicCoordinate(string coordinate_string)
    {
        // REMOVE ANY KNOWN NON-NUMERIC CHARACTERS FROM THE STRING.
        // These characters are used to mark cases where the location isn't known
        // or the location may not be exact. That exactness isn't super important
        // in this context, so the "marker" characters are ignored.
        var numeric_coordinate_string = coordinate_string.Trim('?', '~', '<', '>');
        // A '-' is used sometimes to indicate no location. Since a '-' could also
        // be used for a negative geographic coordinate, it can only be safely
        // trimmed from the end.
        numeric_coordinate_string = numeric_coordinate_string.TrimEnd('-');

        // CHECK IF A COORDINATE EXISTS.
        if (string.IsNullOrWhiteSpace(numeric_coordinate_string))
        {
            // Not all locations in this file may have geographic coordinates.
            return null;
        }

        // PARSE THE NUMERIC COORDINATE.
        // The data file always uses '.' as the decimal separator, so the invariant
        // culture must be used; parsing with the current culture would misread or
        // reject the values on machines whose culture uses ',' as the decimal separator.
        return double.Parse(numeric_coordinate_string, CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Attempts to parse Bible verse references from a CSV list.
    /// </summary>
    /// <param name="verse_references_csv_list">
    /// A CSV list of Bible verse references.
    /// Each reference is expected to be separated by a comma OR a comma and single space.
    /// </param>
    /// <returns>The verse references from the string; an empty list if no verse references exist in the string.</returns>
    /// <exception cref="Exception">Thrown if a parsing error occurs.</exception>
    private static IEnumerable<BibleVerseReference> ParseVerseReferences(string verse_references_csv_list)
    {
        // GET THE INDIVIDUAL VERSE REFERENCE STRINGS FROM THE LIST.
        var verse_reference_strings = verse_references_csv_list.Split(
            [ ", ", "," ],
            StringSplitOptions.RemoveEmptyEntries
        );

        // PARSE EACH VERSE REFERENCE.
        var verse_references = new List<BibleVerseReference>();
        foreach (var verse_reference_string in verse_reference_strings)
        {
            // PARSE THE BOOK.
            // A single space separates the book from the chapter and verse numbers.
            // Since there might be an additional space before that separator
            // for books with numbers at the start, a split can't be used directly.
            // The last space in the reference is therefore used as the boundary.
            const int BOOK_START_INDEX = 0;
            var index_of_space_after_book = verse_reference_string.LastIndexOf(' ');
            var book_string_length_in_characters = index_of_space_after_book;
            var book_string = verse_reference_string.Substring(BOOK_START_INDEX, book_string_length_in_characters);
            var book = new BibleBook(book_string);

            // PARSE THE CHAPTER.
            // A single colon separates the chapter and verse numbers.
            var chapter_start_index = index_of_space_after_book + 1;
            var chapter_and_verse_string = verse_reference_string.Substring(chapter_start_index);
            var chapter_and_verse_numbers = chapter_and_verse_string.Split(
                VerseSeparator,
                StringSplitOptions.RemoveEmptyEntries
            );
            const int CHAPTER_INDEX = 0;
            var chapter_string = chapter_and_verse_numbers[CHAPTER_INDEX];
            // The invariant culture keeps integer parsing independent of regional settings.
            var chapter = int.Parse(chapter_string, CultureInfo.InvariantCulture);

            // PARSE THE VERSE.
            const int VERSE_INDEX = 1;
            var verse_string = chapter_and_verse_numbers[VERSE_INDEX];
            var verse = int.Parse(verse_string, CultureInfo.InvariantCulture);

            // ADD THE PARSED BIBLE VERSE REFERENCE.
            var verse_reference = new BibleVerseReference
            {
                Book = book,
                Chapter = chapter,
                Verse = verse,
            };
            verse_references.Add(verse_reference);
        }

        return verse_references;
    }
}