/* $Id: SubSource.cpp 665170 2023-03-29 15:43:00Z ivanov $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  .......
 *
 * File Description:
 *   .......
 *
 * Remark:
 *   This code was originally generated by application DATATOOL
 *   using the following specifications:
 *   'seqfeat.asn'.
 */

// standard includes
#include <ncbi_pch.hpp>
#include <serial/enumvalues.hpp>

// generated includes
#include <objects/seqfeat/SubSource.hpp>

#include <math.h>
#include <objects/misc/sequence_util_macros.hpp>
#include <corelib/ncbitime.hpp>

#include <util/row_reader_ncbi_tsv.hpp>
#include <mutex>
#include <util/compile_time.hpp>

// generated classes

BEGIN_NCBI_SCOPE

BEGIN_objects_SCOPE // namespace ncbi::objects::

// Storage for the static CSubSource members m_LatLonCountryMap and
// m_LatLonWaterMap.  Both are default-constructed here, i.e. they start out
// as null unique_ptrs.
// NOTE(review): presumably populated on demand by the lat-lon lookup code
// elsewhere in this file -- confirm.
unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonCountryMap;
unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonWaterMap;


// Out-of-line destructor; it performs no explicit cleanup of its own.
CSubSource::~CSubSource() = default;

// Appends a label of the form "/<subtype>=<name>" to *str, followed by
// " (<attrib>)" when an attribute is set.
//
// <subtype> is "other" for eSubtype_other, otherwise the subtype name with
// every '_' replaced by '-'.  If the name lookup throws CSerialException,
// "unknown" is used instead.
void CSubSource::GetLabel(string* str) const
{
    *str += '/';

    string subtype_label;
    if (GetSubtype() != eSubtype_other) {
        try {
            // GetSubtypeName() is called with its default vocabulary; the
            // special cases of eVocabulary_insdc are (historically) not
            // used here.
            subtype_label = GetSubtypeName(GetSubtype());
            replace(subtype_label.begin(), subtype_label.end(), '_', '-');
        } catch (const CSerialException&) {
            subtype_label = "unknown";
        }
    } else {
        subtype_label = "other";
    }

    str->append(subtype_label);
    str->append(1, '=');
    str->append(GetName());

    if ( !IsSetAttrib() ) {
        return;
    }
    str->append(" (");
    str->append(GetAttrib());
    str->append(")");
}


// Maps a textual subtype name to its TSubtype value.
//
// The name is trimmed, lower-cased, and has every '_' and ' ' turned into
// '-' before lookup.  The note aliases ("note", "subsource-note",
// "subsrc-note", "note-subsource") map to eSubtype_other.  With
// eVocabulary_insdc a few INSDC spellings are translated explicitly.  All
// remaining names are resolved through the ESubtype enum table.
CSubSource::TSubtype CSubSource::GetSubtypeValue(const string& str,
                                                 EVocabulary vocabulary)
{
    string key = NStr::TruncateSpaces(str);
    NStr::ToLower(key);
    for (char& ch : key) {
        if (ch == '_'  ||  ch == ' ') {
            ch = '-';
        }
    }

    static const char* const kNoteAliases[] = {
        "note", "subsource-note", "subsrc-note", "note-subsource"
    };
    for (const char* alias : kNoteAliases) {
        if (NStr::EqualNocase(key, alias)) {
            return eSubtype_other;
        }
    }

    if (vocabulary == eVocabulary_insdc) {
        // consider a table if more special cases arise.
        if (key == "insertion-seq") {
            return eSubtype_insertion_seq_name;
        }
        if (key == "plasmid") {
            return eSubtype_plasmid_name;
        }
        if (key == "transposon") {
            return eSubtype_transposon_name;
        }
        if (key == "sub-clone") {
            return eSubtype_subclone;
        }
    }

    return ENUM_METHOD_NAME(ESubtype)()->FindValue(key);
}


bool CSubSource::IsValidSubtypeName(const string& str,
                                    EVocabulary vocabulary)
{

    string name = NStr::TruncateSpaces(str);
    NStr::ToLower(name);
    replace(name.begin(), name.end(), '_', '-');
    replace(name.begin(), name.end(), ' ', '-');

    if ( NStr::EqualNocase(name, "note") ||
         NStr::EqualNocase(name, "subsource-note") ||
         NStr::EqualNocase(name, "subsrc-note") ||
         NStr::EqualNocase(name, "note-subsource")) {
         return true;
    }
    if (vocabulary == eVocabulary_insdc) {
        // consider a table if more special cases arise.
        if (name == "insertion-seq" ||
            name == "plasmid" ||
            name == "transposon" ||
            name == "sub-clone") {
            return true;
        }
    }
    return ENUM_METHOD_NAME(ESubtype)()->IsValidName(name);
}


// Returns the textual name of a subtype.
//
// eSubtype_other is always "note".  For eVocabulary_insdc four subtypes have
// dedicated spellings and every other enum name has '-' replaced by '_'.
// For any other vocabulary the enum name is returned as is.
string CSubSource::GetSubtypeName(CSubSource::TSubtype stype,
                                  EVocabulary vocabulary)
{
    if (stype == CSubSource::eSubtype_other) {
        return "note";
    }

    if (vocabulary != eVocabulary_insdc) {
        return ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true);
    }

    if (stype == eSubtype_subclone) {
        return "sub_clone";
    }
    if (stype == eSubtype_plasmid_name) {
        return "plasmid";
    }
    if (stype == eSubtype_transposon_name) {
        return "transposon";
    }
    if (stype == eSubtype_insertion_seq_name) {
        return "insertion_seq";
    }

    return NStr::Replace(ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true),
                         "-", "_");
}



// Returns false for the subtypes listed below and true for every other
// subtype value.
bool CSubSource::IsMultipleValuesAllowed(TSubtype subtype)
{
    switch (subtype) {
    case eSubtype_chromosome:
    case eSubtype_sex:
    case eSubtype_germline:
    case eSubtype_rearranged:
    case eSubtype_plasmid_name:
    case eSubtype_segment:
    case eSubtype_country:
    case eSubtype_transgenic:
    case eSubtype_environmental_sample:
    case eSubtype_lat_lon:
    case eSubtype_collection_date:
    case eSubtype_collected_by:
    case eSubtype_identified_by:
    case eSubtype_fwd_primer_seq:
    case eSubtype_rev_primer_seq:
    case eSubtype_fwd_primer_name:
    case eSubtype_rev_primer_name:
    case eSubtype_metagenomic:
    case eSubtype_altitude:
    case eSubtype_clone:
        return false;
    default:
        return true;
    }
}


// Returns true for the subtypes listed below and false for all others.
bool CSubSource::NeedsNoText(const TSubtype& subtype)
{
    switch (subtype) {
    case eSubtype_germline:
    case eSubtype_rearranged:
    case eSubtype_transgenic:
    case eSubtype_environmental_sample:
    case eSubtype_metagenomic:
        return true;
    default:
        return false;
    }
}


// Returns true for the subtypes on the "discouraged" list below.
// The metagenomic subsrc qualifier was taken off this list: GB-3384.
bool CSubSource::IsDiscouraged(const TSubtype subtype)
{
    return subtype == eSubtype_frequency
        || subtype == eSubtype_insertion_seq_name
        || subtype == eSubtype_phenotype
        || subtype == eSubtype_plastid_name
        || subtype == eSubtype_transposon_name
        || subtype == eSubtype_fwd_primer_seq
        || subtype == eSubtype_rev_primer_seq
        || subtype == eSubtype_fwd_primer_name
        || subtype == eSubtype_rev_primer_name
        || subtype == eSubtype_whole_replicon;
}


bool CSubSource::IsDayValueOkForMonth(int day, int month, int year)
{
    if (month < 1 || month > 12 || day < 1) {
        return false;
    }
    bool rval = true;
    if (year < 100) {
        year += 2000;
    } else if (year > 3000) {
        return false;
    } else if (year < 1538) {
        return false;
    }
    CTime month_o(year, month, 1);
    if (day > month_o.DaysInMonth()) {
        rval = false;
    }
    return rval;
}


// Parses a single collection-date value into a CDate.
//
// After trimming surrounding whitespace the input is handled as follows:
//   - anything IsISOFormatDate() accepts is converted by
//     GetDateFromISODate() and returned directly;
//   - otherwise the string is split on '-' and read as "YYYY",
//     "<month>-YYYY" or "<day>-<month>-YYYY", where <month> is a name
//     understood by CTime::MonthNameToNum() and <day> is an integer >= 1.
// The year must be in [1000, 2100).  A day is checked against the month
// length through IsDayValueOkForMonth().
//
// No "date is in the future" check is done here; see
// IsCollectionDateAfterTime() for that.
//
// Throws CException (eUnknown) with a descriptive message when the string is
// blank, cannot be split as described, or holds an out-of-range value.
CRef<CDate> CSubSource::DateFromCollectionDate (const string& test) THROWS((CException))
{
    if (NStr::IsBlank(test)) {
        NCBI_THROW (CException, eUnknown,
                        "collection-date string is blank");
    }
    string str = NStr::TruncateSpaces(test);

    if (IsISOFormatDate(str)) {
        return GetDateFromISODate(str);
    }

    size_t pos = NStr::Find(str, "-");
    string year;
    string month;
    string day;

    if (pos == NPOS) {
        // no dash: the whole string is the year
        year = str;
    } else {
        size_t pos2 = NStr::Find(str, "-", pos + 1);
        if (pos2 == NPOS) {
            // one dash: <month>-<year>
            month = str.substr(0, pos);
            year = str.substr(pos + 1);
            if (NStr::IsBlank(month)) {
                NCBI_THROW (CException, eUnknown,
                                "collection-date string is improperly formatted");
            }
        } else {
            // two dashes: <day>-<month>-<year>; anything after a third dash
            // stays in `year` and fails the integer conversion below
            day = str.substr(0, pos);
            month = str.substr(pos + 1, pos2 - pos - 1);
            year = str.substr(pos2 + 1);
            if (NStr::IsBlank(month) || NStr::IsBlank(day)) {
                NCBI_THROW (CException, eUnknown,
                                "collection-date string is improperly formatted");
            }
        }
    }

    int month_val = 0;
    if (!NStr::IsBlank(month)) {
        try {
            month_val = CTime::MonthNameToNum(month);
        } catch (const CTimeException&) {
            NCBI_THROW (CException, eUnknown,
                            "collection-date string has invalid month");
        }
    }

    int day_val = 0;
    if (!NStr::IsBlank(day)) {
        try {
            day_val = NStr::StringToInt (day);
        } catch ( const exception& ) {
            // threw exception while converting to int
            NCBI_THROW (CException, eUnknown,
                            "collection-date string is improperly formatted");
        }
        // This check must stay outside the try block above: CException is a
        // std::exception, so throwing it inside would be caught by the
        // handler and reported with the wrong message.
        if (day_val < 1) {
            NCBI_THROW (CException, eUnknown,
                            "collection-date string has invalid day value");
        }
    }

    if (NStr::IsBlank(year)) {
        NCBI_THROW (CException, eUnknown,
                        "collection-date string is improperly formatted");
    }

    int year_val = 0;
    try {
        year_val = NStr::StringToInt (year);
    } catch ( const exception& ) {
        // threw exception while converting to int
        NCBI_THROW (CException, eUnknown,
                        "collection-date string is improperly formatted");
    }

    if (year_val < 1000 || year_val >= 2100) {
        NCBI_THROW (CException, eUnknown,
                        "collection-date year is out of range");
    }

    if (day_val > 0 && month_val > 0 && !IsDayValueOkForMonth(day_val, month_val, year_val)) {
        NCBI_THROW (CException, eUnknown,
                        "collection-date day is greater than monthly maximum");
    }

    CRef<CDate> date(new CDate);

    date->SetStd().SetYear (year_val);
    if (month_val > 0) {
        date->SetStd().SetMonth (month_val);
    }
    if (day_val > 0) {
        date->SetStd().SetDay (day_val);
    }

    return date;
}


bool CSubSource::IsCollectionDateAfterTime(const string& collection_date, time_t t, bool& bad_format)
{
    bad_format = false;
    bool in_future = false;
    vector<string> pieces;
    NStr::Split(collection_date, "/", pieces);
    if (pieces.size() > 2) {
        bad_format = true;
    } else {
        ITERATE(vector<string>, it, pieces) {
            CRef<CDate> coll_date = DateFromCollectionDate (*it);
            if (!coll_date) {
                bad_format = true;
            } else if (IsCollectionDateAfterTime(*coll_date, t)) {
                in_future = true;
            }
        }
    }
    return in_future;
}


bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, time_t t)
{
    CDate now(t);
    if (collection_date.Compare(now) == CDate::eCompare_after) {
        return true;
    } else {
        return false;
    }
}


// CTime convenience overload: converts ctime to time_t and forwards to the
// (const CDate&, time_t) overload.
bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, CTime& ctime)
{
    const time_t as_time_t = ctime.GetTimeT();
    return IsCollectionDateAfterTime(collection_date, as_time_t);
}


void CSubSource::IsCorrectDateFormat(const string& date_string, bool& bad_format, bool& in_future)
{
    bad_format = false;
    in_future = false;

    vector<string> pieces;
    NStr::Split(date_string, "/", pieces);
    if (pieces.size() > 2) {
        bad_format = true;
        return;
    } else if (pieces.size() == 2) {
        bool first_bad = false;
        bool first_future = false;
        bool second_bad = false;
        bool second_future = false;
        IsCorrectDateFormat(pieces[0], first_bad, first_future);
        IsCorrectDateFormat(pieces[1], second_bad, second_future);
        bad_format = first_bad || second_bad;
        if (!bad_format) {
            in_future = first_future || second_future;
        }
        return;
    }

    try {
        CRef<CDate> coll_date = CSubSource::DateFromCollectionDate (date_string);

        if (!IsISOFormatDate(date_string)) {
            // if there are two dashes, then the first token needs to be the day, and the
            // day has to have two numbers, a leading zero if the day is less than 10
            size_t pos = NStr::Find(date_string, "-");
            if (pos != NPOS) {
                size_t pos2 = NStr::Find(date_string, "-", pos + 1);
                if (pos2 != NPOS  &&  pos != 2) {
                    bad_format = true;
                }
            }
        }

        if (!bad_format) {
            time_t t;

            time(&t);

            in_future = IsCollectionDateAfterTime(*coll_date, t);
        }
    } catch (const CException& ) {
        bad_format = true;
    }
}

size_t CSubSource::CheckDateFormat(const string& date_string)
{
    size_t rval = eDateFormatFlag_ok;
    vector<string> pieces;
    NStr::Split(date_string, "/", pieces);
    if (pieces.size() > 2) {
        rval |= eDateFormatFlag_bad_format;
    } else if (pieces.size() == 2) {
        rval |= CheckDateFormat(pieces[0]);
        rval |= CheckDateFormat(pieces[1]);
        if (rval == eDateFormatFlag_ok) {
            try {
                CRef<CDate> d1 = CSubSource::DateFromCollectionDate(pieces[0]);
                CRef<CDate> d2 = CSubSource::DateFromCollectionDate(pieces[1]);
                if (d2->Compare(*d1) == CDate::eCompare_before) {
                    rval |= eDateFormatFlag_out_of_order;
                }
            } catch (const CException&) {
                rval |= eDateFormatFlag_bad_format;
            }
        }
        return rval;
    }

    try {
        CRef<CDate> coll_date = CSubSource::DateFromCollectionDate(date_string);

        if (!IsISOFormatDate(date_string)) {
            // if there are two dashes, then the first token needs to be the day, and the
            // day has to have two numbers, a leading zero if the day is less than 10
            size_t pos = NStr::Find(date_string, "-");
            if (pos != NPOS) {
                size_t pos2 = NStr::Find(date_string, "-", pos + 1);
                if (pos2 != NPOS  &&  pos != 2) {
                    rval |= eDateFormatFlag_bad_format;
                }
            }
        }

        if (rval == eDateFormatFlag_ok) {
            time_t t;

            time(&t);
            if (IsCollectionDateAfterTime(*coll_date, t)) {
                rval |= eDateFormatFlag_in_future;
            }
        }
    } catch (const CException&) {
        rval |= eDateFormatFlag_bad_format;
    }
    return rval;
}

// Static set of C strings, keyed with the PCase_CStr comparator.
typedef CStaticArraySet<const char*, PCase_CStr> TCStrSet;

// Null-term exemption values for collection dates: a value found in this
// set is accepted by GetCollectionDateProblem() without any date parsing.
// The set is built with the ct::tagStrCase tag and the lookup passes the
// string unmodified, so a value has to match an entry as written here.
// The order of the entries is not important.
MAKE_CONST_SET(s_Null_CollectionDatesSet, ct::tagStrCase,
{
    "missing",
    "missing: control sample",
    "missing: data agreement established pre-2023",
    "missing: endangered species",
    "missing: human-identifiable",
    "missing: lab stock",
    "missing: sample group",
    "missing: synthetic construct",
    "missing: third party data",
    "not applicable",
    "not collected",
    "not provided",
    "restricted access",
})

// Returns a human-readable description of what is wrong with a
// collection-date value, or an empty string when nothing is wrong.
//
// Values listed in s_Null_CollectionDatesSet are accepted as is.  Otherwise
// the flags from CheckDateFormat() are reported in priority order:
// bad format, then in the future, then out of order.
string CSubSource::GetCollectionDateProblem (const string& date_string)
{
    const bool is_null_term =
        s_Null_CollectionDatesSet.find(date_string.c_str()) != s_Null_CollectionDatesSet.end();
    if (is_null_term) {
        return string();
    }

    const size_t flags = CheckDateFormat(date_string);
    if (flags & eDateFormatFlag_bad_format) {
        return "Collection_date format is not in DD-Mmm-YYYY format";
    }
    if (flags & eDateFormatFlag_in_future) {
        return "Collection_date is in the future";
    }
    if (flags & eDateFormatFlag_out_of_order) {
        return "Collection_dates are out of order";
    }
    return string();
}


// Tries to read orig_date as "<date><delim><date>" and returns the two dates
// reformatted and joined with '/'.
//
// Returns an empty string when the delimiter (matched case-insensitively) is
// absent, when it occurs again after its first occurrence, or when either
// side cannot be reformatted unambiguously by FixDateFormat().
string CSubSource::x_ParseDateRangeWithDelimiter(const string& orig_date, CTempString delim)
{
    const size_t split_at = NStr::Find(orig_date, delim, NStr::eNocase);
    if (split_at == NPOS) {
        return kEmptyStr;
    }

    // A further occurrence would make the split point ambiguous.
    const string remainder = orig_date.substr(split_at + 1);
    if (NStr::Find(remainder, delim, NStr::eNocase) != NPOS) {
        return kEmptyStr;
    }

    bool ambiguous = false;

    const string range_start = FixDateFormat(orig_date.substr(0, split_at), true, ambiguous);
    if (ambiguous || NStr::IsBlank(range_start)) {
        return kEmptyStr;
    }

    const string range_end = FixDateFormat(orig_date.substr(split_at + delim.length()), true, ambiguous);
    if (ambiguous || NStr::IsBlank(range_end)) {
        return kEmptyStr;
    }

    return range_start + "/" + range_end;
}


// Reformats a free-form date, or date range, into the collection-date form.
//
// The string is first treated as a single date (month-first interpretation).
// An ambiguous month yields an empty result.  If no single date could be
// produced, the string is retried as a range using each of the delimiters
// "/", " to ", " and ", "-" and "_" in that order; the first successful
// split wins.  Returns an empty string when nothing works.
string CSubSource::FixDateFormat (const string& orig_date)
{
    bool ambiguous_month = false;
    string result = FixDateFormat(orig_date, true, ambiguous_month);

    if (ambiguous_month) {
        result.clear();
        return result;
    }
    if (!NStr::IsBlank(result)) {
        return result;
    }

    static const char* const kRangeDelimiters[] = {"/", " to ", " and ", "-", "_"};
    for (const char* delimiter : kRangeDelimiters) {
        result = x_ParseDateRangeWithDelimiter(orig_date, delimiter);
        if (!NStr::IsBlank(result)) {
            break;
        }
    }
    return result;
}

// ISO Format for time is one of these:
// HH:MM:SS
// HH:MM
// HH
// Followed by either Z or +hh:mm to indicate an offset from Zulu
//
// Returns true when orig_time matches one of these forms.  With
// require_time_zone set, a time without "Z" / "+hh:mm" is rejected.
//
// The match is strict: the field separators must be ':' and nothing may
// follow the 'Z' designator.
//
// On success hour, min and sec receive the parsed values.  For a "+hh:mm"
// offset the offset hours are subtracted from hour, and the offset minutes
// are subtracted from min when a minutes field is present.  The results are
// not normalised, so hour and min can come back negative.  On failure the
// output arguments may have been partially written.
bool CSubSource::IsISOFormatTime(const string& orig_time, int& hour, int& min, int& sec, bool require_time_zone)
{
    // Reads orig_time[pos], orig_time[pos + 1] as a two-digit number.
    // The caller guarantees that both positions exist.
    auto read_two_digits = [&orig_time](size_t pos, int& value) -> bool {
        const unsigned char tens = static_cast<unsigned char>(orig_time[pos]);
        const unsigned char ones = static_cast<unsigned char>(orig_time[pos + 1]);
        if (!isdigit(tens) || !isdigit(ones)) {
            return false;
        }
        value = (tens - '0') * 10 + (ones - '0');
        return true;
    };

    int offset_hour = 0;
    int offset_min = 0;

    // `suffix` is the index where the time-of-day part ends.
    size_t suffix = NStr::Find(orig_time, "Z");
    if (suffix != NPOS) {
        // 'Z' terminates the time; trailing characters are not allowed
        if (suffix + 1 != orig_time.length()) {
            return false;
        }
    } else {
        suffix = NStr::Find(orig_time, "+");
        if (suffix == NPOS) {
            if (require_time_zone) {
                return false;
            }
            suffix = orig_time.length();
        } else {
            // the offset has to be exactly "+hh:mm"
            if (orig_time.length() - suffix != 6 ||
                orig_time[suffix + 3] != ':' ||
                !read_two_digits(suffix + 1, offset_hour) ||
                !read_two_digits(suffix + 4, offset_min)) {
                return false;
            }
        }
    }

    // "HH", "HH:MM" and "HH:MM:SS" are 2, 5 and 8 characters long
    if (suffix != 2 && suffix != 5 && suffix != 8) {
        return false;
    }

    int parsed_hour = 0;
    if (!read_two_digits(0, parsed_hour)) {
        return false;
    }
    hour = parsed_hour;
    min = 0;
    sec = 0;
    if (hour > 23) {
        return false;
    }
    hour -= offset_hour;

    if (suffix > 2) {
        if (orig_time[2] != ':' || !read_two_digits(3, min)) {
            return false;
        }
        if (min > 59) {
            return false;
        }
        min -= offset_min;
    }

    if (suffix == 8) {
        if (orig_time[5] != ':' || !read_two_digits(6, sec)) {
            return false;
        }
        if (sec > 59) {
            return false;
        }
    }

    return true;
}

// ISO Format for date is exactly 10 characters long OR exactly 7 characters long.
// For ten characters:
// First four characters must be digits, represent year.
// Fifth character must be dash.
// Sixth and seventh characters must be digits, represent month, use zero padding.
// Eighth character must be dash.
// Ninth and tenth characters must be digits, represent day, use zero padding.
// For 7 characters:
// First four characters must be digits, represent year.
// Fifth character must be dash.
// Sixth and seventh characters must be digits, represent month, use zero padding.
bool CSubSource::IsISOFormatDateOnly (const string& cpy)
{
    if (cpy.length() != 10 && cpy.length() != 7) {
        return false;
    }
    bool rval = true;
    size_t pos = 0;
    string::const_iterator it = cpy.begin();
    while (it != cpy.end() && rval) {
        if (pos == 4 || pos == 7) {
            if (*it != '-') {
                rval = false;
            }
        } else if (!isdigit(*it)) {
            rval = false;
        }
        ++it;
        ++pos;
    }
    if (rval) {
        try {
            int year = NStr::StringToInt(cpy.substr(0, 4));
            int month = NStr::StringToInt(cpy.substr(5, 2));
            if (month < 1 || month > 12) {
                rval = false;
            }
            if (cpy.length() == 10) { // has day
                int day = NStr::StringToInt(cpy.substr(8, 2));
                if (!IsDayValueOkForMonth(day, month, year)) {
                    rval = false;
                }
            }
        } catch (...) {
            rval = false;
        }
    }
    return rval;
}


bool CSubSource::x_IsFixableIsoDate(const string& orig_date)
{
    string cpy = orig_date;
    NStr::TruncateSpacesInPlace(cpy);
    size_t time_pos = NStr::Find(cpy, "T");
    bool rval = false;
    if (time_pos == NPOS) {
        rval = false;
    } else {
        if (!IsISOFormatDateOnly(cpy.substr(0, time_pos))) {
            rval = false;
        } else {
            int h, m, s;
            if (IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, true)) {
                // already fine, not fixable
                rval = false;
            } else {
                rval = IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, false);
            }
        }
    }
    return rval;
}


// Returns orig_date, trimmed of surrounding whitespace, with everything
// from the first 'T' onwards removed.
string CSubSource::x_RemoveIsoTime(const string& orig_date)
{
    string trimmed = orig_date;
    NStr::TruncateSpacesInPlace(trimmed);

    const size_t t_pos = NStr::Find(trimmed, "T");
    if (t_pos == NPOS) {
        return trimmed;
    }
    return trimmed.substr(0, t_pos);
}


bool CSubSource::IsISOFormatDate(const string& orig_date)
{
    string cpy = orig_date;
    NStr::TruncateSpacesInPlace(cpy);
    size_t time_pos = NStr::Find(cpy, "T");
    if (time_pos == NPOS) {
        return IsISOFormatDateOnly(cpy);
    } else {
        int h, m, s;
        return (IsISOFormatDateOnly(cpy.substr(0, time_pos)) &&
            IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s));
    }

}

// Builds a CDate from a string laid out as "YYYY-MM" or "YYYY-MM-DD...".
//
// The year is read from characters 0-3 and the month from 5-6.  When the
// trimmed string is longer than 7 characters the day is read from 8-9.
// Anything after that (such as a time part) is ignored.  No range checks are
// made here.  Returns a null CRef if any of the conversions throws.
CRef<CDate> CSubSource::GetDateFromISODate(const string& orig_date)
{
    CRef<CDate> result;
    try {
        string trimmed = orig_date;
        NStr::TruncateSpacesInPlace(trimmed);

        CRef<CDate> date(new CDate());
        const int year = NStr::StringToInt(trimmed.substr(0, 4));
        const int month = NStr::StringToInt(trimmed.substr(5, 2));
        date->SetStd().SetYear (year);
        date->SetStd().SetMonth (month);

        const bool has_day = trimmed.length() > 7;
        if (has_day) {
            const int day = NStr::StringToInt(trimmed.substr(8, 2));
            date->SetStd().SetDay (day);
        }
        result = date;
    } catch (...) {
        result.Reset();
    }
    return result;
}


// Splits a free-form date string into tokens.
//
// Tokens are separated by any of the characters " ,-/=_." and also wherever
// the text switches between letters and non-letters (so "12Jan2020" gives
// "12", "Jan", "2020").  Blank tokens are dropped.
//
// If that leaves more than three tokens, an ordinal suffix token ("st",
// "nd", "rd", "th", any case) that directly follows a token starting with a
// digit is merged back into that token ("21" + "st" -> "21st").
vector<string> CSubSource::x_GetDateTokens(const string& orig_date)
{
    vector<string> tokens;
    const string token_delimiters = " ,-/=_.";

    string cpy = orig_date;
    NStr::TruncateSpacesInPlace (cpy);

    string curr_token;
    bool is_chars = false;   // true while curr_token consists of letters
    for (const char ch : cpy) {
        // cast required: isalpha() is undefined for negative char values
        const bool ch_is_alpha = isalpha(static_cast<unsigned char>(ch)) != 0;

        if (token_delimiters.find(ch) != NPOS) {
            // explicit delimiter: close the current token
            if (!NStr::IsBlank(curr_token)) {
                tokens.push_back(curr_token);
            }
            curr_token.clear();
            is_chars = false;
        } else if (is_chars && !ch_is_alpha) {
            // previous token was all letters, do not add non-letter characters
            if (!NStr::IsBlank(curr_token)) {
                tokens.push_back(curr_token);
            }
            curr_token = ch;
            is_chars = false;
        } else if (!NStr::IsBlank(curr_token) && !is_chars && ch_is_alpha) {
            // previous token had no letters
            tokens.push_back(curr_token);
            curr_token = ch;
            is_chars = true;
        } else {
            curr_token += ch;
            if (ch_is_alpha) {
                is_chars = true;
            }
        }
    }
    if (!NStr::IsBlank(curr_token)) {
        tokens.push_back(curr_token);
    }

    // reattach 'st', 'nd', 'rd', and 'th' to numbers if present
    if (tokens.size() > 3) {
        // p trails s by one element; erasing at s leaves p valid
        vector<string>::iterator p = tokens.begin();
        bool prev_is_number = isdigit((unsigned char)(*p)[0]);
        vector<string>::iterator s = p;
        ++s;
        while (s != tokens.end()) {
            if (prev_is_number &&
                (NStr::EqualNocase(*s, "st") ||
                NStr::EqualNocase(*s, "nd") ||
                NStr::EqualNocase(*s, "rd") ||
                NStr::EqualNocase(*s, "th"))) {
                *p += *s;
                s = tokens.erase(s);
                // the merged token must not absorb a second suffix
                prev_is_number = false;
            } else {
                ++p;
                ++s;
                prev_is_number = isdigit((unsigned char)(*p)[0]);
            }
        }
    }

    return tokens;
}


// Decides which of two numeric tokens is the month and which is the day.
//
// A value below 13 can be a month.  If neither value qualifies the function
// returns false.  If exactly one qualifies it becomes the month.  If both
// qualify and are equal there is nothing to decide.  If both qualify and
// differ, month_ambiguous is set to true and month_first picks the month.
// `month` receives the abbreviated month name and `day` the other value.
// Returns false if a token is not an integer or the month name lookup
// throws.  month_ambiguous is never reset to false here.
bool s_ChooseMonthAndDay(const string& token1, const string& token2, bool month_first, string& month, int& day, bool& month_ambiguous)
{
    try {
        const int first = NStr::StringToInt (token1);
        const int second = NStr::StringToInt (token2);
        const bool first_fits = first < 13;
        const bool second_fits = second < 13;

        if (!first_fits && !second_fits) {
            // both numbers too big for month
            return false;
        }

        bool month_is_first;
        if (first_fits && second_fits) {
            if (first == second) {
                // no need to call this ambiguous
                month_is_first = true;
            } else {
                // both numbers could be month
                month_ambiguous = true;
                month_is_first = month_first;
            }
        } else {
            month_is_first = first_fits;
        }

        const int month_num = month_is_first ? first : second;
        const int day_num = month_is_first ? second : first;
        month = CTime::MonthNumToName(month_num, CTime::eAbbr);
        day = day_num;
        return true;
    } catch ( ... ) {
        return false;
    }
}


// Tries to turn a free-form single date into the collection-date form
// "YYYY", "Mmm-YYYY" or "DD-Mmm-YYYY" (day zero-padded to two digits).
//
// - A string that already is an ISO date is returned unchanged (trimmed); an
//   ISO date whose only problem is its time part is returned without the
//   time.
// - Otherwise the string is tokenised with x_GetDateTokens(); between one
//   and three tokens are accepted.  Ordinals ("1st", "second", "15th"),
//   month names and numbers above 31 (years) are recognised directly; the
//   remaining small numbers are assigned to day / month / year by the rules
//   below.
//
// month_first      selects the interpretation when two numbers could both be
//                  the month.
// month_ambiguous  always written: true when the month/day assignment (or
//                  the year, for three small numbers) had to be guessed.
//
// Returns an empty string when the date cannot be interpreted or the
// resulting year is outside [1000, 2100).
string CSubSource::FixDateFormat (const string& test, bool month_first, bool& month_ambiguous)
{
    // Initialise the out-parameter before any return so that callers always
    // get a defined value, including on the ISO fast paths below.
    month_ambiguous = false;

    string orig_date = test;
    NStr::TruncateSpacesInPlace(orig_date);

    if (IsISOFormatDate(orig_date)) {
        return orig_date;
    } else if (x_IsFixableIsoDate(orig_date)) {
        return x_RemoveIsoTime(orig_date);
    }

    string reformatted_date;
    string month;
    int year = 0, day = 0;
    size_t num_original_tokens = 0;

    vector<string> tokens = x_GetDateTokens(orig_date);

    num_original_tokens = tokens.size();
    if (tokens.size() < 1 || tokens.size() > 3) {
        // no tokens or too many tokens
        return kEmptyStr;
    }

    // First pass: consume every token whose meaning is unambiguous.  Tokens
    // that remain afterwards are numbers in the range 1..31.
    string one_token;
    vector<string>::iterator it = tokens.begin();
    while (it != tokens.end()) {
        one_token = *it;
        bool found = false;
        if (NStr::EqualNocase(one_token, "1st") || NStr::EqualNocase(one_token, "first")) {
            day = 1;
            found = true;
        } else if (NStr::EqualNocase(one_token, "2nd") || NStr::EqualNocase(one_token, "second")) {
            day = 2;
            found = true;
        } else if (NStr::EqualNocase(one_token, "3rd") || NStr::EqualNocase (one_token, "third")) {
            day = 3;
            found = true;
        } else if (one_token.length() > 0
                   && isdigit((unsigned char)one_token[0])
                   && NStr::EndsWith(one_token, "th")) {
            // "<number>th"
            try {
                day = NStr::StringToInt (one_token.substr(0, one_token.length() - 2));
                found = true;
            } catch ( ... ) {
                // threw exception while converting to int
                return kEmptyStr;
            }
        } else if (isalpha((unsigned char)one_token[0])) {
            if (!NStr::IsBlank(month)) {
                // already have month, error
                return kEmptyStr;
            }
            // month names are matched on their first three letters
            if (one_token.length() > 3) {
                one_token = one_token.substr(0, 3);
            }
            try {
                int month_num = CTime::MonthNameToNum(one_token);
                found = true;
                month = CTime::MonthNumToName(month_num, CTime::eAbbr);
            } catch (const CTimeException&) {
                // not a month name: the token is left in place, which makes
                // the numeric conversions further down fail
            }
        } else {
            try {
                int this_val = NStr::StringToInt (one_token);
                int min = 1;
                int max = 31;
                if (this_val < min) {
                    return kEmptyStr;
                } else if (this_val > max) {
                    // too large for a day or month: must be the year
                    if (year > 0) {
                        // already have year, error
                        return kEmptyStr;
                    }
                    year = this_val;
                    found = true;
                }
            } catch ( ... ) {
                // threw exception while converting to int
                return kEmptyStr;
            }
        }
        if (found) {
            it = tokens.erase(it);
        } else {
            ++it;
        }
    }

    // Second pass: assign the left-over small numbers.
    if (tokens.size() == 0) {
        // good - all tokens assigned to values
    } else if (tokens.size() > 2) {
        // three numbers: treat last one as year
        try {
            year = NStr::StringToInt(tokens[2]);
            if (year < 100) {
                year += 2000;
            }
            if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
                return kEmptyStr;
            }
            // mark month as ambiguous, since we are guessing about year
            month_ambiguous = true;
        } catch ( ... ) {
            // threw exception while converting to int
            return kEmptyStr;
        }
    } else if (tokens.size() == 1) {
        // one number left: fill the first missing field in the order
        // year, month, day
        try {
            int val = NStr::StringToInt (tokens[0]);
            if (year == 0) {
                year = val;
            } else {
                if (NStr::IsBlank (month)) {
                    if (val > 0 && val < 13) {
                        month = CTime::MonthNumToName(val, CTime::eAbbr);
                    } else {
                        // month number out of range
                        return kEmptyStr;
                    }
                } else {
                    day = val;
                }
            }
        } catch ( ... ) {
            // threw exception while converting to int
            return kEmptyStr;
        }
    } else if (!NStr::IsBlank (month)) {
        if (tokens.size() == 2) {
            // we have a month and two other numbers (we hope)
            int val1 = 0;
            int val2 = 0;
            try {
                val1 = NStr::StringToInt (tokens[0]);
                val2 = NStr::StringToInt (tokens[1]);
            } catch (const CException& /*e*/) {
                // not actually numbers
                return kEmptyStr;
            }
            bool zero_pad_1 = NStr::StartsWith(tokens[0], "0");
            bool zero_pad_2 = NStr::StartsWith(tokens[1], "0");
            if (val1 < 10 && !zero_pad_1 && (val2 > 10 || zero_pad_2)) {
                // if the first token is below 10 and not zero-padded while
                // the second is zero-padded or greater than 10, the first is
                // the day and the second (+2000) is the year
                day = val1;
                year = val2 + 2000;
            } else if (val2 < 10 && !zero_pad_2 && (val1 > 10 || zero_pad_1)) {
                // the same rule with the two tokens swapped
                day = val2;
                year = val1 + 2000;
            } else {
                // no padding hint: prefer the first token as the day if it
                // is a valid day for this month
                int month_num = CTime::MonthNameToNum(month);
                if (IsDayValueOkForMonth(val1, month_num, val2 + 2000)) {
                    day = val1;
                    year = val2 + 2000;
                } else {
                    day = val2;
                    year = val1 + 2000;
                }
            }
        } else {
            return kEmptyStr;
        }
    } else {
        // two numbers left and no month yet: they are month and day
        if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
            return kEmptyStr;
        }
    }

    // make sure day is valid
    if (day > 0 && !NStr::IsBlank(month) && year > -1) {
        try {
            int month_num = CTime::MonthNameToNum(month);
            if (!IsDayValueOkForMonth(day, month_num, year)) {
                return kEmptyStr;
            }
        } catch (const CTimeException&) {
            return kEmptyStr;
        }
    }

    if (year > 0 && year < 100 && num_original_tokens > 1) {
        // try to guess year from two-digit year provided,
        // only if it could not possibly be a day of the month
        // and if there were at least two tokens provided
        string year_date = NStr::NumericToString(year + 2000);
        bool format_bad = false;
        bool in_future = false;
        IsCorrectDateFormat(year_date, format_bad, in_future);
        if (in_future) {
            // 20xx has not happened yet, so assume 19xx
            year += 1900;
        } else {
            year += 2000;
        }
    }
    if (year >= 1000 && year < 2100) {
        reformatted_date = NStr::NumericToString (year);
        if (!NStr::IsBlank (month)) {
            reformatted_date = month + "-" + reformatted_date;
            if (day > 0) {
                string day_str = NStr::NumericToString (day);
                if (day_str.length() < 2) {
                    day_str = "0" + day_str;
                }
                reformatted_date = day_str + "-" + reformatted_date;
            }
        }
    }

    return reformatted_date;
}


void CSubSource::DetectDateFormat(const string& orig_date, bool& ambiguous, bool &day_first)
{
    ambiguous = false;
    day_first = false;
    vector<string> tokens = x_GetDateTokens(orig_date);
    if (tokens.size() != 3) {
        // can't do detection if there are more or less than three tokens
        ambiguous = true;
        return;
    }
    vector<int> nums;

    // detection is only valid if all tokens are numbers and at least one is known to be the year
    try {
        ITERATE(vector<string>, it, tokens) {
            nums.push_back(NStr::StringToInt (*it));
        }
    } catch ( ... ) {
        // threw exception while converting to int
        ambiguous = true;
        return;
    }
    enum EPos { eDay = 0, eMonth = 1, eYear = 2 };
    vector<int> positions;
    positions.push_back(0);
    positions.push_back(0);
    positions.push_back(0);

    int token_pos = 1;
    ITERATE(vector<int>, it, nums) {
        if (*it > 31) {
            if (positions[eYear] > 0) {
                // already found a year
                ambiguous = true;
                return;
            }
            positions[eYear] = token_pos;
        } else if (*it > 12) {
            if (positions[eDay] > 0) {
                // already found a day
                ambiguous = true;
                return;
            }
            positions[eDay] = token_pos;
        } else if (positions[eMonth] > 0) {
            // already found a month
            ambiguous = true;
            return;
        } else {
            positions[eMonth] = token_pos;
        }
        token_pos++;
    }
    if (positions[eDay] < positions[eMonth]) {
        day_first = true;
    } else {
        day_first = false;
    }
}


// Check whether lat_lon is written in the canonical "<lat> <N|S> <lon> <E|W>"
// form and report several properties of it.
//
// lat_lon            string to examine
// format_correct     true when re-printing the parsed values with the same
//                    number of decimals reproduces the input
// precision_correct  true when format is correct and both values have fewer
//                    than 3 decimals (as reported by x_GetPrecision)
// lat_in_range       true when format is correct and 0 <= latitude <= 90
// lon_in_range       true when format is correct and 0 <= longitude <= 180
// lat_value          signed latitude (negative for S); 0.0 if not parsed
// lon_value          signed longitude (negative for W); 0.0 if not parsed
//
// All output flags are false unless the corresponding check succeeded.
void CSubSource::IsCorrectLatLonFormat (string lat_lon, bool& format_correct, bool& precision_correct,
                                     bool& lat_in_range, bool& lon_in_range,
                                     double& lat_value, double& lon_value)
{
    format_correct = false;
    lat_in_range = false;
    lon_in_range = false;
    precision_correct = false;
    lat_value = 0.0;
    lon_value = 0.0;

    if (NStr::IsBlank(lat_lon)) {
        return;
    }

    double ns = 0.0, ew = 0.0;
    char lon = '\0', lat = '\0';
    int processed = 0;

    // the whole string must be consumed by "<number> <char> <number> <char>"
    if (sscanf (lat_lon.c_str(), "%lf %c %lf %c%n", &ns, &lat, &ew, &lon, &processed) != 4
        || size_t(processed) != lat_lon.length()) {
        return;
    }
    if ((lat != 'N' && lat != 'S') || (lon != 'E' && lon != 'W')) {
        return;
    }

    // init values found
    lat_value = (lat == 'N') ? ns : 0.0 - ns;
    lon_value = (lon == 'E') ? ew : 0.0 - ew;

    // make sure format is correct
    vector<string> pieces;
    NStr::Split(lat_lon, " ", pieces);
    if (pieces.size() <= 3) {
        return;
    }

    int precision_lat = x_GetPrecision(pieces[0]);
    int precision_lon = x_GetPrecision(pieces[2]);

    // Re-print the values with the precision found in the input.  The
    // precision and the magnitude both come from the caller's string, so the
    // write must be bounded; output that does not fit cannot match a sane
    // lat_lon and is treated as incorrectly formatted.
    char reformatted[1000];
    const int written = snprintf (reformatted, sizeof(reformatted),
                                  "%.*lf %c %.*lf %c", precision_lat, ns, lat,
                                                       precision_lon, ew, lon);
    if (written < 0 || size_t(written) >= sizeof(reformatted)) {
        return;
    }

    const size_t len = size_t(written);
    if (NStr::StartsWith(lat_lon, reformatted)
        && (len == lat_lon.length()
          || (len < lat_lon.length()
              && lat_lon[len] == ';'))) {
        format_correct = true;
        if (ns <= 90 && ns >= 0) {
            lat_in_range = true;
        }
        if (ew <= 180 && ew >= 0) {
            lon_in_range = true;
        }
        if (precision_lat < 3 && precision_lon < 3) {
            precision_correct = true;
        }
    }
}


// Re-print a correctly formatted, in-range lat_lon value with at most four
// decimals per coordinate.
//
// Returns orig unchanged when IsCorrectLatLonFormat reports that the format
// is wrong, a coordinate is out of range, or the precision is already
// acceptable.  Returns kEmptyStr if the string does not split into at least
// four space-separated pieces.
string CSubSource::FixLatLonPrecision(const string& orig)
{
    bool format_correct = false;
    bool precision_correct = false;
    bool lat_in_range = false;
    bool lon_in_range = false;
    double lat_value = 0.0;
    double lon_value = 0.0;
    IsCorrectLatLonFormat(orig, format_correct, precision_correct,
                          lat_in_range, lon_in_range,
                          lat_value, lon_value);
    if (!format_correct || !lat_in_range || !lon_in_range || precision_correct) {
        return orig;
    }

    vector<string> pieces;
    NStr::Split(orig, " ", pieces);
    if (pieces.size() <= 3) {
        return kEmptyStr;
    }

    // cap both coordinates at four decimals
    int precision_lat = x_GetPrecision(pieces[0]);
    int precision_lon = x_GetPrecision(pieces[2]);
    if (precision_lat > 4) {
        precision_lat = 4;
    }
    if (precision_lon > 4) {
        precision_lon = 4;
    }

    // bounded write: never run past the end of the local buffer
    char reformatted[1000];
    snprintf(reformatted, sizeof(reformatted), "%.*lf %c %.*lf %c",
             precision_lat, fabs(lat_value), pieces[1].c_str()[0],
             precision_lon, fabs(lon_value), pieces[3].c_str()[0]);
    return reformatted;
}

/*
1. The string should be converted to a UTF-8 string; this gets rid of \xC0 and similar substrings.
2. Every code point (note that this is not a regular ASCII "char") that is not a digit, a decimal point, or a letter should be prepended with a space.
   Transitions from alpha to digit/point and from digit/point to alpha should also be prepended with a space.
3. NStr::Split is called with space as the separator and the Tokenize flag - need to check whether Split works properly with UTF-8 strings.
4. After this we should have a vector of tokens, some of which are numbers and others are "modifiers" such as ', '', degrees, N, S, E, W, etc.
5. A pattern string is created where each number is replaced with "1" and modifiers are normalized to "lat" or "N"; the actual numerical values are kept in a separate vector.
6. Based on the pattern, the vector of numbers is parsed into degrees, minutes, or seconds.
7. NSEW and "latitude/longitude" labels are applied to the degrees in order of appearance; if none are present, another heuristic is used to determine which value is latitude and which is longitude.
*/

// Return a copy of old_str with spaces inserted so that tokens can later be
// split on whitespace:
//  - any ASCII character other than a letter, digit, '.', '-' or '+' is
//    surrounded by spaces;
//  - a space is inserted wherever the text switches between a letter and a
//    non-letter;
//  - every symbol decoded as >= 0x80 is replaced by a single space.
static string s_InsertSpacesBetweenTokens(const string &old_str)
{
    // characters that may form part of a word or a number
    const auto is_token_char = [](char ch) {
        return isalpha(ch) || isdigit(ch) || ch == '.' || ch == '-' || ch == '+';
    };

    string spaced;
    for (string::const_iterator pos = old_str.begin(); pos != old_str.end(); ++pos)
    {
        const TUnicodeSymbol sym = CUtf8::Decode(pos);
        if (sym >= 0x80)
        {
            spaced += ' ';
            continue;
        }

        const char c = static_cast<char>(sym);
        const bool standalone = !is_token_char(c);
        if (standalone)
        {
            spaced += ' ';
        }
        else if (!spaced.empty())
        {
            // letter <-> non-letter boundary
            const bool prev_alpha = isalpha(spaced.back()) != 0;
            const bool curr_alpha = isalpha(c) != 0;
            if (prev_alpha != curr_alpha)
            {
                spaced += ' ';
            }
        }

        spaced += c;

        if (standalone)
        {
            spaced += ' ';
        }
    }
    return spaced;
}

// Return a copy of old_str in which a decimal point that was separated from
// its digits by single spaces ("12 . 5") is glued back together ("12.5").
// Symbols decoded as >= 0x80 are replaced by a space.  If the whole input
// consists only of digits, '+', '-', '.' and whitespace, every '+' and '-'
// additionally gets a space in front of it.
static string s_RemoveSpacesWithinNumbers(const string &old_str)
{
    string out;
    bool numeric_only = true;

    for (string::const_iterator pos = old_str.begin(); pos != old_str.end(); ++pos)
    {
        const TUnicodeSymbol sym = CUtf8::Decode(pos);
        if (sym >= 0x80)
        {
            out += ' ';
            numeric_only = false;
            continue;
        }

        const char c = static_cast<char>(sym);
        const size_t n = out.size();

        // output currently ends in "<digit> . " and another digit follows
        const bool split_decimal =
            n >= 4
            && out[n - 1] == ' '
            && out[n - 2] == '.'
            && out[n - 3] == ' '
            && isdigit(out[n - 4])
            && isdigit(c);
        if (split_decimal)
        {
            // drop " . " and put back a bare decimal point
            out.erase(n - 3);
            out += '.';
        }

        out += c;

        const bool numeric_char =
            isdigit(c) || c == '+' || c == '-' || c == '.' || isspace(c);
        if (!numeric_char)
        {
            numeric_only = false;
        }
    }

    if (numeric_only)
    {
        NStr::ReplaceInPlace(out, "+", " +");
        NStr::ReplaceInPlace(out, "-", " -");
    }
    return out;
}

// Report whether token converts to a double with NStr::StringToDouble.
// A zero result together with a non-zero errno is treated as a failed
// conversion.  On success the value is stored in *result when result is
// not null.
static bool s_IsNumber(const string &token, double *result = NULL)
{
    const double value = NStr::StringToDouble(token, NStr::fConvErr_NoThrow);

    const bool failed = (!value && errno);
    if (failed)
    {
        return false;
    }

    if (result != NULL)
    {
        *result = value;
    }
    return true;
}

// Classify every token and build a space-joined pattern string.
//
// tokens     input tokens; unit words are rewritten in place to their
//            canonical spelling ("degrees", "'" or "\"")
// numbers    receives the value of every numeric token
// anum       receives the text of every numeric token
// precision  receives the number of decimals of every numeric token
// lat_long   receives "lat" or "long" for every latitude/longitude label
// nsew       receives "N", "S", "E" or "W" for every direction token
//
// In the returned pattern a number is "1", a degree unit is "degrees",
// minutes are "'", seconds are "\"", any lat/long label is "lat" and any
// single direction is "N".  Separator tokens and the compound directions
// NW/NE/SW/SE add nothing to the pattern.  An unrecognized token clears
// numbers and makes the function return kEmptyStr.
static string s_NormalizeTokens(vector<string> &tokens, vector<double> &numbers, vector<string> &anum, vector<int> &precision, vector<string> &lat_long,  vector<string> &nsew)
{
    vector<string> symbols;

    // case-insensitive comparison against a list of accepted spellings
    const auto spelled_as = [](const string &word, std::initializer_list<const char*> spellings) {
        for (const char* candidate : spellings) {
            if (NStr::EqualNocase(word, candidate)) {
                return true;
            }
        }
        return false;
    };

    // register one numeric token with zero decimals
    const auto record_number = [&](double value, const string &text) {
        numbers.push_back(value);
        anum.push_back(text);
        symbols.push_back("1");
        precision.push_back(0);
    };

    // register a single compass direction
    const auto record_direction = [&](const char* letter) {
        symbols.push_back("N");
        nsew.push_back(letter);
    };

    // register a compound direction; it contributes nothing to the pattern
    const auto record_direction_pair = [&](const char* first, const char* second) {
        nsew.push_back(first);
        nsew.push_back(second);
    };

    for (size_t idx = 0; idx < tokens.size(); ++idx)
    {
        string &token = tokens[idx];

        // plain number
        double value;
        if (s_IsNumber(token, &value))
        {
            record_number(value, token);
            const size_t dot = token.find('.');
            if (dot != NPOS && token.back() != '.')
            {
                precision.back() = static_cast<int>(token.length() - dot - 1);
            }
            continue;
        }

        // three numbers joined by dots, e.g. "12.34.56"
        {
            vector<string> dotted;
            NStr::Split(token, ".", dotted);
            double part0, part1, part2;
            if (dotted.size() == 3
                && s_IsNumber(dotted[0], &part0)
                && s_IsNumber(dotted[1], &part1)
                && s_IsNumber(dotted[2], &part2))
            {
                record_number(part0, dotted[0]);
                record_number(part1, dotted[1]);
                record_number(part2, dotted[2]);
                continue;
            }
        }

        // a second "number '" right after a "number '" is read as seconds
        if (token == "\'" && idx >= 3
            && s_IsNumber(tokens[idx - 1])
            && tokens[idx - 2] == "\'"
            && s_IsNumber(tokens[idx - 3]))
        {
            token = "\"";
        }

        if (spelled_as(token, {"degrees", "deg", "deg.", "degree"}))
        {
            token = "degrees";
            symbols.push_back("degrees");
        }
        else if (token == "\'" || spelled_as(token, {"min", "min.", "minute", "minutes"}))
        {
            token = "\'";
            symbols.push_back("\'");
        }
        else if (token == "\"" || spelled_as(token, {"sec", "sec.", "second", "seconds"}))
        {
            token = "\"";
            symbols.push_back("\"");
        }
        else if (token == "," || token == ":" || token == "_" || token == "&"
                 || token == "." || token == ";" || token == "#"
                 || NStr::EqualNocase(token, "and"))
        {
            // separators are ignored
        }
        else if (spelled_as(token, {"lattitude", "latitude", "lat", "lat."}))
        {
            symbols.push_back("lat");
            lat_long.push_back("lat");
        }
        else if (spelled_as(token, {"longitude", "lo", "lon", "long", "lo.", "lon.", "long."}))
        {
            symbols.push_back("lat");
            lat_long.push_back("long");
        }
        else if (token == "N" || NStr::EqualNocase(token, "north"))
        {
            record_direction("N");
        }
        else if (token == "S" || NStr::EqualNocase(token, "south"))
        {
            record_direction("S");
        }
        else if (token == "E" || NStr::EqualNocase(token, "east"))
        {
            record_direction("E");
        }
        else if (token == "W" || NStr::EqualNocase(token, "west") || token == "Wdeg")
        {
            record_direction("W");
        }
        else if (token == "NW")
        {
            record_direction_pair("N", "W");
        }
        else if (token == "NE")
        {
            record_direction_pair("N", "E");
        }
        else if (token == "SW")
        {
            record_direction_pair("S", "W");
        }
        else if (token == "SE")
        {
            record_direction_pair("S", "E");
        }
        else
        {
            // unknown token: give up
            numbers.clear();
            return kEmptyStr;
        }
    }

    return NStr::Join(symbols, " ");
}

// Put a pair of coordinates into (latitude, longitude) order and apply the
// hemisphere signs.
//
// numbers    the two coordinates; cleared when the input is not usable
// precision  decimals for each coordinate; swapped together with numbers
// lat_long   "lat"/"long" labels in order of appearance (must be 0 or 2)
// nsew       "N"/"S"/"E"/"W" directions in order of appearance (0 or 2)
//
// numbers is cleared when there are not exactly two coordinates, when the
// number of labels or directions is neither 0 nor 2, when the directions do
// not form one N/S plus one E/W, or when the final latitude exceeds 90 or
// the final longitude exceeds 180 in absolute value.
static void s_ReorderNorthSouthEastWest(vector<double> &numbers, vector<int> &precision, const vector<string> &lat_long, vector<string> &nsew)
{
    // exchange the two coordinates together with their precision
    const auto swap_coords = [&]() {
        swap(numbers[0], numbers[1]);
        swap(precision[0], precision[1]);
    };

    if (numbers.size() != 2)
    {
        numbers.clear();
        return;
    }

    // explicit lat/long labels
    if (lat_long.size() == 2)
    {
        if (lat_long.front() == "long")
        {
            swap_coords();
            if (nsew.size() == 2) {
                swap(nsew[0], nsew[1]);
            }
        }
    }
    else if (!lat_long.empty())
    {
        numbers.clear();
        return;
    }

    // compass directions
    if (nsew.size() == 2)
    {
        const bool lon_first = (nsew[0] == "E" || nsew[0] == "W")
                            && (nsew[1] == "N" || nsew[1] == "S");
        if (lon_first)
        {
            swap_coords();
            swap(nsew[0], nsew[1]);
        }

        if (nsew[0] == "N")
        {
            numbers[0] = fabs(numbers[0]);
        }
        else if (nsew[0] == "S")
        {
            // leave an exact zero untouched
            if (numbers[0] != 0) {
                numbers[0] = -fabs(numbers[0]);
            }
        }
        else
        {
            numbers.clear();
            return;
        }

        if (nsew[1] == "E")
        {
            numbers[1] = fabs(numbers[1]);
        }
        else if (nsew[1] == "W")
        {
            // leave an exact zero untouched
            if (numbers[1] != 0) {
                numbers[1] = -fabs(numbers[1]);
            }
        }
        else
        {
            numbers.clear();
            return;
        }
    }
    else if (!nsew.empty())
    {
        numbers.clear();
        return;
    }

    // no hints at all: a first value beyond 90 can only be a longitude
    const bool no_hints = lat_long.empty() && nsew.empty();
    if (no_hints && fabs(numbers[0]) > 90 && fabs(numbers[1]) < 90)
    {
        swap_coords();
    }

    // final range check
    if (fabs(numbers[0]) > 90 || fabs(numbers[1]) > 180)
    {
        numbers.clear();
        return;
    }
}

// Convert a whitespace-separated lat/lon description into decimal degrees.
//
// new_str    input text; split on spaces and classified by s_NormalizeTokens
// numbers    output: on success holds two values (first, second coordinate in
//            decimal degrees, after s_ReorderNorthSouthEastWest has ordered
//            and signed them); left empty when the input is not recognized
// precision  output: number of decimals to print for each coordinate
//
// In the patterns below "1" stands for a number, "N" for any single compass
// direction and "lat" for any latitude/longitude label (this is how
// s_NormalizeTokens encodes them).  Each branch lists the token layouts it
// accepts and requires minute/second components to lie in [0, 60).
// When minutes contribute to a coordinate its precision is at least the
// minutes' decimals + 2; when seconds contribute, at least the seconds'
// decimals + 4.
static void s_GetLatLong(const string &new_str, vector<double> &numbers, vector<int> &precision)
{
    vector<string> tokens;
    NStr::Split(new_str, " ", tokens, NStr::fSplit_Tokenize);
    vector<string> lat_long;
    vector<string> nsew;
    vector<string> anum;
    string pattern = s_NormalizeTokens(tokens, numbers, anum, precision, lat_long, nsew);
    if (pattern.empty())
    {
        numbers.clear();
        return;
    }
    // results are assembled here and swapped into the output arguments at the end
    vector<double> degrees(2, 0);
    vector<int> prec(2, 0);
    // the sign is taken from the text of the degrees token (anum) so that the
    // minutes/seconds are added to the magnitude before the sign is applied
    int sign1 = 1;
    int sign2 = 1;
    // both coordinates given as plain decimal degrees
    if ( pattern == "1 1" ||
     pattern == "1 N 1 N" ||
         pattern == "N 1 N 1" ||
     pattern == "1 degrees N 1 degrees N" ||
     pattern == "lat 1 lat 1" ||
         pattern == "1 N lat 1 N lat" ||
         pattern == "1 degrees N lat 1 degrees N lat")
    {
        degrees[0] = numbers[0];
        degrees[1] = numbers[1];
        prec[0] = precision[0];
        prec[1] = precision[1];
    }
    // first: degrees + seconds; second: degrees + minutes
    else if ((pattern == "1 1 \" 1 1 '" ||
          pattern == "1 degrees 1 \" N 1 degrees 1 ' N")
         && numbers[1] < 60 && numbers[3] < 60
             && numbers[1] >= 0 && numbers[3] >= 0)
    {
        sign1 = anum[0][0] == '-' ? -1 : 1;
        sign2 = anum[2][0] == '-' ? -1 : 1;
        degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 3600);
        degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
        prec[0] = max(precision[0], precision[1] + 4);
        prec[1] = max(precision[2], precision[3] + 2);
    }
    // first: degrees + minutes; second: degrees only
    else if ( (pattern == "1 1 ' 1" ||
               pattern == "1 degrees 1 ' N 1 degrees N")
              && numbers[1] < 60
              && numbers[1] >= 0)
    {
        sign1 = anum[0][0] == '-' ? -1 : 1;
        degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
        degrees[1] = numbers[2];
        prec[0] = max(precision[0], precision[1] + 2);
        prec[1] = precision[2];
        }
    // first: degrees + minutes + seconds; second: degrees only
    else if (pattern == "1 1 ' 1 \" 1"
         && numbers[1] < 60 && numbers[2] < 60
             && numbers[1] >= 0 && numbers[2] >= 0)
    {
        sign1 = anum[0][0] == '-' ? -1 : 1;
        degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
        degrees[1] = numbers[3];
        prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
        prec[1] = precision[3];
    }
    // first: degrees + minutes + seconds; second: degrees + minutes
    else if ((pattern == "1 1 ' 1 \" 1 1 '" ||
          pattern == "1 1 1 N 1 1 N" ||
          pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' N")
         && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
             && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
    {
        sign1 = anum[0][0] == '-' ? -1 : 1;
        sign2 = anum[3][0] == '-' ? -1 : 1;
        degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
        degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60);
        prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
        prec[1] = max(precision[3], precision[4] + 2);
    }
    // both: degrees + minutes + seconds
    else if (( pattern == "1 1 ' 1 \" 1 1 ' 1 \"" ||
           pattern == "1 1 ' 1 \" N 1 1 ' 1 \" N" ||
           pattern == "1 degrees 1 ' 1 \" 1 degrees 1 ' 1 \"" ||
           pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \" N" ||
           pattern == "N 1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \"" ||
           pattern == "1 degrees 1 ' 1 N 1 degrees 1 ' 1 N" ||
           pattern == "1 degrees 1 1 N 1 degrees 1 1 N" ||
           pattern == "1 1 1 N 1 1 1 N")
             && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60 && numbers[5] < 60
             && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0 && numbers[5] >= 0)
    {
        sign1 = anum[0][0] == '-' ? -1 : 1;
        sign2 = anum[3][0] == '-' ? -1 : 1;
        degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
        degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60 + numbers[5] / 3600);
        prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
        prec[1] = max(max(precision[3], precision[4] + 2), precision[5] + 4);
    }
    // both: degrees + minutes
    else if (( pattern == "1 1 ' 1 1 '" ||
           pattern == "1 1 N 1 1 N" ||
               pattern == "1 1 ' N 1 1 ' N" ||
           pattern == "1 degrees 1 ' N 1 degrees 1 ' N" ||
               pattern == "lat 1 degrees 1 ' N lat 1 degrees 1 ' N" ||
           pattern == "1 degrees 1 N 1 degrees 1 N" ||
           pattern == "1 degrees 1 N 1 degrees 1 ' N" ||
               pattern == "1 degrees 1 ' N 1 degrees 1 N" ||
               pattern == "N 1 degrees 1 ' N 1 degrees 1" ||
               pattern == "N 1 degrees 1 ' N 1 degrees 1 '" ||
               pattern == "N 1 degrees 1 ' N 1 1 '")
         && numbers[1] < 60  && numbers[3] < 60
             && numbers[1] >= 0  && numbers[3] >= 0)
    {
        sign1 = anum[0][0] == '-' ? -1 : 1;
        sign2 = anum[2][0] == '-' ? -1 : 1;
        degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
        degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
        prec[0] = max(precision[0], precision[1] + 2);
        prec[1] = max(precision[2], precision[3] + 2);
    }
    // first: degrees only; second: degrees + minutes
    else if ((pattern == "1 N 1 1 N" ||
              pattern == "1 degrees N 1 degrees 1 ' N")
         &&  numbers[2] < 60
             &&  numbers[2] >= 0)
    {
        sign2 = anum[1][0] == '-' ? -1 : 1;
        degrees[0] = numbers[0];
        degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60);
        prec[0] = precision[0];
        prec[1] = max(precision[1], precision[2] + 2);
    }
    // first: degrees + minutes; second: degrees + minutes + seconds
    else if ((pattern == "1 degrees 1 ' 1 degrees 1 ' 1 \"" ||
              pattern == "N 1 1 N 1 1 1")
         && numbers[1] < 60 && numbers[3] < 60 && numbers[4] < 60
             && numbers[1] >= 0 && numbers[3] >= 0 && numbers[4] >= 0)
    {
        sign1 = anum[0][0] == '-' ? -1 : 1;
        sign2 = anum[2][0] == '-' ? -1 : 1;
        degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
        degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60 + numbers[4] / 3600);
        prec[0] = max(precision[0], precision[1] + 2);
        prec[1] = max(max(precision[2], precision[3] + 2), precision[4] + 4);
    }
    // first: degrees only; second: degrees + minutes + seconds
    else if (pattern == "1 degrees 1 degrees 1 ' 1 \""
         && numbers[2] < 60 && numbers[3] < 60
             && numbers[2] >= 0 && numbers[3] >= 0)
    {
        sign2 = anum[1][0] == '-' ? -1 : 1;
        degrees[0] = numbers[0];
        degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60 + numbers[3] / 3600);
        prec[0] = precision[0];
        prec[1] = max(max(precision[1], precision[2] + 2), precision[3] + 4);
    }
    // first: degrees + minutes + seconds; second: degrees + seconds
    else if (pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 \" N"
         && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
             && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
    {
        sign1 = anum[0][0] == '-' ? -1 : 1;
        sign2 = anum[3][0] == '-' ? -1 : 1;
        degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
        degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 3600);
        prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
        prec[1] = max(precision[3], precision[4] + 4);
    }
    // unrecognized layout: hand back empty vectors
    else
    {
        degrees.clear();
        prec.clear();
    }
    // publish the result, then let the lat/long labels and compass
    // directions decide ordering and sign (this also clears numbers when it
    // does not hold exactly two values)
    swap(degrees, numbers);
    swap(prec, precision);
    s_ReorderNorthSouthEastWest(numbers, precision, lat_long, nsew);
}


// Limit the latitude and longitude of a "<lat> <N|S> <lon> <E|W>" string to
// at most eight digits after the decimal point.
//
// subname is returned unchanged when it is empty, does not start with a
// digit, or its second and fourth fields are not N/S and E/W respectively.
// Otherwise the four fields are re-joined with single spaces.
string s_ShortenLatLon( string &subname ) {
    if (subname.length() < 1) {
       return subname;
    }
    char ch = subname[0];
    if (ch < '0' || ch > '9') {
        return subname;
    }

    // extract the pieces
    string lat;
    string north_or_south;
    string lon;
    string east_or_west;

    CNcbiIstrstream lat_lon_stream( subname );
    lat_lon_stream >> lat >> north_or_south >> lon >> east_or_west;
    if( lat_lon_stream.bad() ) {
        return subname;
    }

    if( north_or_south != "N" && north_or_south != "S" ) {
        return subname;
    }

    if( east_or_west != "E" && east_or_west != "W" ) {
        return subname;
    }

    // Keep the decimal point plus eight digits.  The search result has to be
    // compared against NPOS: a "not found" position is the largest size_t,
    // so adding 9 to it wraps around and would wrongly shorten a value that
    // has no decimal point at all.
    const auto limit_decimals = [](string& value) {
        const size_t pos = value.find('.');
        if (pos != NPOS && pos + 9 < value.length()) {
            value.erase(pos + 9);
        }
    };
    limit_decimals(lat);
    limit_decimals(lon);

    return lat + " " + north_or_south + " " + lon + " " + east_or_west;
}

// Try to rewrite a free-form lat/lon description into the canonical
// "<lat> <N|S> <lon> <E|W>" form.
//
// orig_lat_lon  text to interpret
// guess         not used by this implementation
//
// Returns the normalized string, or an empty string when the text could not
// be interpreted as a pair of coordinates.
string CSubSource::FixLatLonFormat (string orig_lat_lon, bool guess)
{
    // NOTE(review): the result of this call is not used; if
    // NStr::ParseEscapes returns the converted text instead of editing its
    // argument, the call has no effect -- confirm.
    NStr::ParseEscapes(orig_lat_lon);

    CStringUTF8 utf8_text = CUtf8::AsUTF8(orig_lat_lon, CUtf8::GuessEncoding(orig_lat_lon));

    // strip one pair of enclosing double quotes
    if (NStr::StartsWith(utf8_text, "\""))
    {
        NStr::TrimPrefixInPlace(utf8_text, "\"");
        NStr::TrimSuffixInPlace(utf8_text, "\"");
    }
    // two apostrophes stand for a seconds mark
    NStr::ReplaceInPlace(utf8_text, "\'\'", "\"");

    string spaced = s_InsertSpacesBetweenTokens(s_RemoveSpacesWithinNumbers(utf8_text));

    // NOTE(review): result not used here either; same question as for
    // ParseEscapes above -- confirm.
    NStr::Sanitize(spaced);

    vector<double> coords;
    vector<int> decimals;
    s_GetLatLong(spaced, coords, decimals);

    string formatted;
    if (!coords.empty())
    {
        formatted = MakeLatLon(coords[0], coords[1], decimals[0], decimals[1]);
    }
    return s_ShortenLatLon(formatted);
}


// Format signed coordinates as "<lat> <N|S> <lon> <E|W>".
//
// lat_value      latitude; negative values are printed as S
// lon_value      longitude; negative values are printed as W
// lat_precision  precision passed to NStr::DoubleToString for the latitude
// lon_precision  precision passed to NStr::DoubleToString for the longitude
//
// A trailing "." left by the conversion is removed from each number.
string CSubSource::MakeLatLon(double lat_value, double lon_value, int lat_precision, int lon_precision )
{
    const char ns = (lat_value < 0) ? 'S' : 'N';
    const char ew = (lon_value < 0) ? 'W' : 'E';

    // only strictly negative values are negated
    if (lat_value < 0) {
        lat_value = -lat_value;
    }
    if (lon_value < 0) {
        lon_value = -lon_value;
    }

    string lat_text = NStr::DoubleToString(lat_value, lat_precision);
    string lon_text = NStr::DoubleToString(lon_value, lon_precision);
    NStr::TrimSuffixInPlace(lat_text, ".");
    NStr::TrimSuffixInPlace(lon_text, ".");

    string result = lat_text;
    result += " ";
    result += ns;
    result += " ";
    result += lon_text;
    result += " ";
    result += ew;
    return result;
}


// Build a CLatLonCountryId describing how the given coordinates relate to
// the claimed country/province.
//
// The coordinates are looked up in the country map first, then in the water
// map; when neither contains the point, the closest land and water regions
// are recorded instead.  If none of the regions found agrees with the
// claimed country (and province, when one is given), the distance to the
// claimed region is recorded.
//
// The returned object is allocated with new; the caller receives the raw
// pointer.
CLatLonCountryId *CSubSource::x_CalculateLatLonId(float lat_value, float lon_value, string country, string province)
{
    CLatLonCountryId *id = new CLatLonCountryId(lat_value, lon_value);

    bool goodmatch = false;

    // Record the closest land region and note whether it agrees with the
    // claimed country/province.
    const auto note_closest_land = [&]() {
        double landdistance = 0.0;
        const CCountryExtreme * nearest = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
        if (!nearest) {
            return;
        }
        id->SetClosestFull(nearest->GetCountry());
        id->SetClosestCountry(nearest->GetLevel0());
        id->SetClosestProvince(nearest->GetLevel1());
        id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
        if (NStr::EqualNocase(country, id->GetClosestCountry())
            && (NStr::IsBlank(province) || NStr::EqualNocase(province, nearest->GetLevel1()))) {
            goodmatch = true;
        }
    };

    // lookup region by coordinates, or find nearest region and calculate distance
    const CCountryExtreme * guess = m_LatLonCountryMap->GuessRegionForLatLon(lat_value, lon_value, country, province);
    if (guess) {
        // inside a country
        id->SetFullGuess(guess->GetCountry());
        id->SetGuessCountry(guess->GetLevel0());
        id->SetGuessProvince(guess->GetLevel1());
        if (NStr::EqualNocase(country, id->GetGuessCountry())
            && (NStr::IsBlank(province) || NStr::EqualNocase(province, id->GetGuessProvince()))) {
            goodmatch = true;
        }
    } else {
        // not inside a country, check water
        guess = m_LatLonWaterMap->GuessRegionForLatLon(lat_value, lon_value, country);
        if (guess) {
            // found inside water
            id->SetGuessWater(guess->GetCountry());
            if (NStr::EqualNocase(country, id->GetGuessWater())) {
                goodmatch = true;
            }

            // also see if close to land for coastal warning (if country is land)
            // or proximity message (if country is water)
            note_closest_land();
        } else {
            // may be coastal inlet, area of data insufficiency
            note_closest_land();

            double waterdistance = 0.0;
            guess = m_LatLonWaterMap->FindClosestToLatLon (lat_value, lon_value, 5.0, waterdistance);
            if (guess) {
                id->SetClosestWater(guess->GetLevel0());
                id->SetWaterDistance(m_LatLonWaterMap->AdjustAndRoundDistance (waterdistance));
                if (NStr::EqualNocase(country, id->GetClosestWater())) {
                    goodmatch = true;
                }
            }
        }
    }

    if (goodmatch) {
        return id;
    }

    // the guess is not the provided country or province:
    // calculate distance to claimed country
    double distance = 0.0;
    guess = m_LatLonCountryMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
    if (guess) {
        if (distance < ErrorDistance(lat_value, lon_value, m_LatLonCountryMap->GetScale())) {
            // close enough
            id->SetGuessCountry(country);
            id->SetGuessProvince(province);
            id->SetFullGuess(guess->GetCountry());
        } else {
            id->SetClaimedFull(guess->GetCountry());
            id->SetClaimedDistance(m_LatLonCountryMap->AdjustAndRoundDistance (distance));
        }
    } else if (NStr::IsBlank(province)) {
        // no province claimed: the claimed name may be a body of water
        guess = m_LatLonWaterMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
        if (guess) {
            id->SetClaimedFull(guess->GetCountry());
            id->SetClaimedDistance(m_LatLonWaterMap->AdjustAndRoundDistance (distance));
        }
    }

    return id;
}



// Table pairing a named body of water (first column) with the larger body
// of water it is listed under (second column).  It backs sc_WaterPairMap,
// which is searched by x_FindSurroundingOcean.
// NOTE(review): the entries are in case-insensitive alphabetical order by
// key; CStaticArrayMap presumably relies on that ordering -- confirm before
// inserting new rows out of order.
typedef SStaticPair<const char*, const char*>  TWaterPairElem;
static const TWaterPairElem k_water_pair_map[] = {
    {"Adriatic Sea",         "Mediterranean Sea"},
    {"Aegean Sea",           "Mediterranean Sea"},
    {"Alboran Sea",          "Mediterranean Sea"},
    {"Andaman Sea",          "Indian Ocean"},
    {"Arabian Sea",          "Indian Ocean"},
    {"Argentine Sea",        "Atlantic Ocean"},
    {"Ariake Sea",           "Pacific Ocean"},
    {"Baffin Bay",           "Atlantic Ocean"},
    {"Balearic Sea",         "Mediterranean Sea"},
    {"Baltic Sea",           "Atlantic Ocean"},
    {"Barents Sea",          "Arctic Ocean"},
    {"Bay of Bengal",        "Indian Ocean"},
    {"Beaufort Sea",         "Arctic Ocean"},
    {"Bering Sea",           "Pacific Ocean"},
    {"Bismarck Sea",         "Pacific Ocean"},
    {"Black Sea",            "Mediterranean Sea"},
    {"Bohai Sea",            "Pacific Ocean"},
    {"Caribbean Sea",        "Atlantic Ocean"},
    {"Celebes Sea",          "Pacific Ocean"},
    {"Champlain Sea",        "Atlantic Ocean"},
    {"Chilean Sea",          "Pacific Ocean"},
    {"China Seas",           "Pacific Ocean"},
    {"Chukchi Sea",          "Arctic Ocean"},
    {"Coral Sea",            "Pacific Ocean"},
    {"Davis Strait",         "Atlantic Ocean"},
    {"East China Sea",       "Pacific Ocean"},
    {"East Siberian Sea",    "Arctic Ocean"},
    {"English Channel",      "Atlantic Ocean"},
    {"Erythraean Sea",       "Indian Ocean"},
    {"Golfo de California",  "Pacific Ocean"},
    {"Greenland Sea",        "Arctic Ocean"},
    {"Gulf of Mexico",       "Atlantic Ocean"},
    {"Gulf of Thailand",     "Pacific Ocean"},
    {"Gulf of Tonkin",       "Pacific Ocean"},
    {"Hudson Bay",           "Arctic Ocean"},
    {"Ionian Sea",           "Mediterranean Sea"},
    {"Irish Sea",            "Atlantic Ocean"},
    {"Irminger Sea",         "Atlantic Ocean"},
    {"James Bay",            "Atlantic Ocean"},
    {"Java Sea",             "Indian Ocean"},
    {"Kara Sea",             "Arctic Ocean"},
    {"Koro Sea",             "Pacific Ocean"},
    {"Labrador Sea",         "Atlantic Ocean"},
    {"Laccadive Sea",        "Indian Ocean"},
    {"Laptev Sea",           "Arctic Ocean"},
    {"Ligurian Sea",         "Mediterranean Sea"},
    {"Lincoln Sea",          "Arctic Ocean"},
    {"Myrtoan Sea",          "Mediterranean Sea"},
    {"North Sea",            "Atlantic Ocean"},
    {"Norwegian Sea",        "Atlantic Ocean"},
    {"Pechora Sea",          "Arctic Ocean"},
    {"Persian Gulf",         "Indian Ocean"},
    {"Philippine Sea",       "Pacific Ocean"},
    {"Red Sea",              "Indian Ocean"},
    {"Salish Sea",           "Pacific Ocean"},
    {"Sargasso Sea",         "Atlantic Ocean"},
    {"Scotia Sea",           "Southern Ocean"},
    {"Sea of Azov",          "Black Sea"},
    {"Sea of Chiloe",        "Pacific Ocean"},
    {"Sea of Crete",         "Mediterranean Sea"},
    {"Sea of Japan",         "Pacific Ocean"},
    {"Sea of Okhotsk",       "Pacific Ocean"},
    {"Sea of the Hebrides",  "Atlantic Ocean"},
    {"Sea of Zanj",          "Indian Ocean"},
    {"Seas of Greenland",    "Atlantic Ocean"},
    {"Sethusamudram",        "Indian Ocean"},
    {"Sibutu Passage",       "Pacific Ocean"},
    {"Solomon Sea",          "Pacific Ocean"},
    {"South China Sea",      "Pacific Ocean"},
    {"Sulu Sea",             "Pacific Ocean"},
    {"Tasman Sea",           "Pacific Ocean"},
    {"Thracian Sea",         "Mediterranean Sea"},
    {"Timor Sea",            "Indian Ocean"},
    {"Tyrrhenian Sea",       "Mediterranean Sea"},
    {"Wandel Sea",           "Arctic Ocean"},
    {"White Sea",            "Arctic Ocean"},
    {"Yellow Sea",           "Pacific Ocean"}
};
// Static map over the table above; keyed by C string with the PNocase_CStr
// comparator.
typedef CStaticArrayMap<const char*, const char*, PNocase_CStr> TWaterPairMap;
DEFINE_STATIC_ARRAY_MAP(TWaterPairMap, sc_WaterPairMap, k_water_pair_map);

// Returns the sea or ocean that sc_WaterPairMap lists for the given body of
// water, or an empty string when the name has no entry in the table.
static string x_FindSurroundingOcean (string& water)
{
    const auto entry = sc_WaterPairMap.find(water.c_str());
    if (entry == sc_WaterPairMap.end()) {
        return kEmptyStr;
    }
    return entry->second;
}


// Checks whether a lat_lon value is consistent with the country (and optional
// province) named in a country qualifier.
//
//   input_countryname  country qualifier text; anything after the first ';'
//                      or ',' is ignored, and "Country: Province" is split at
//                      the first ':'
//   lat_lon            lat_lon qualifier text; modified in place when a
//                      trailing ",<something>" part had to be stripped to make
//                      it parse, or when swapping/negating the coordinates is
//                      the suggested correction
//   check_state        enables the state/province-level reports
//   errcode            receives the category of the reported problem, or
//                      eLatLonCountryErr_None
//
// Returns the error message, or an empty string when there is nothing to
// report (this includes blank inputs, an unparsable or out-of-range lat_lon,
// and a country that is found in neither the country nor the water map).
string CSubSource::ValidateLatLonCountry (const string& input_countryname, string& lat_lon, bool check_state, ELatLonCountryErr& errcode)
{
    errcode = eLatLonCountryErr_None;
    string countryname = input_countryname;
    if (NStr::IsBlank(countryname) || NStr::IsBlank(lat_lon)) {
        return kEmptyStr;
    }

    {
        // create the shared country/water maps on first use; the mutex keeps
        // two callers from building them at the same time
        static std::mutex m;

        std::lock_guard g(m);

        if ( m_LatLonCountryMap.get() == nullptr ) {
            m_LatLonCountryMap.reset (new CLatLonCountryMap(false));
        }
        if ( m_LatLonWaterMap.get() == nullptr ) {
            m_LatLonWaterMap.reset (new CLatLonCountryMap(true));
        }
    }

    // only do these checks if the latlon format is good
    bool format_correct = false, lat_in_range = false, lon_in_range = false, precision_correct = false;
    double lat_value = 0.0, lon_value = 0.0;
    CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
                               lat_in_range, lon_in_range,
                               lat_value, lon_value);
    if (!format_correct) {
        // may have comma and then altitude, so just get lat_lon component
        size_t pos = NStr::Find(lat_lon, ",", NStr::eNocase, NStr::eReverseSearch);
        if (pos != NPOS) {
            lat_lon = lat_lon.substr(0, pos);
            CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
                                       lat_in_range, lon_in_range,
                                       lat_value, lon_value);
        }
    }

    // reality checks
    if (!format_correct || !lat_in_range || !lon_in_range) {
        // incorrect lat_lon format should be reported elsewhere
        // incorrect latitude range should be reported elsewhere
        // incorrect longitude range should be reported elsewhere
        return kEmptyStr;
    }

    // get rid of comments after semicolon or comma in country name
    size_t pos = NStr::Find(countryname, ";");
    if (pos != NPOS) {
        countryname = countryname.substr(0, pos);
    }
    pos = NStr::Find(countryname, ",");
    if (pos != NPOS) {
        countryname = countryname.substr(0, pos);
    }

    // adjust for special cases
    if (NStr::StartsWith(countryname, "Norway: Svalbard")) {
        countryname = "Svalbard";
    }

    string country = countryname;
    string province;
    pos = NStr::Find(country, ":");
    if (pos != NPOS) {
        // is the full string in the list?
        if (m_LatLonCountryMap->HaveLatLonForRegion(countryname)) {
            province = country.substr(pos + 1);
            NStr::TruncateSpacesInPlace(province, NStr::eTrunc_Both);
        }
        country = country.substr(0, pos);
        NStr::TruncateSpacesInPlace(country, NStr::eTrunc_Both);
    }
    if (NStr::IsBlank(country)) {
        return kEmptyStr;
    }

    // known exceptions - don't even bother calculating any further
    if (NStr::EqualNocase (country, "Antarctica") && lat_value < -60.0) {
        return kEmptyStr;
    }

    if (! NStr::IsBlank(province)) {
        // do not attempt quick exit
    } else if (m_LatLonCountryMap->HaveLatLonForRegion(country)) {
        if (m_LatLonCountryMap->IsCountryInLatLon(country, lat_value, lon_value)) {
            return kEmptyStr;
        }
    } else if (m_LatLonWaterMap->HaveLatLonForRegion(country)) {
        if (m_LatLonWaterMap->IsCountryInLatLon(country, lat_value, lon_value)) {
            return kEmptyStr;
        }
    } else if (NStr::EqualNocase (country, "State of Palestine")) {
        // not in either map, but handled explicitly further down
    } else {
        // report unrecognized country
        return kEmptyStr;
    }

    // x_CalculateLatLonId hands back a raw pointer that the caller must
    // release; holding it in a unique_ptr frees it on every exit path.
    unique_ptr<CLatLonCountryId> id(x_CalculateLatLonId(lat_value, lon_value, country, province));
    if (!id) {
        // no id could be calculated, so there is nothing to compare against
        return kEmptyStr;
    }
    CLatLonCountryId::TClassificationFlags flags = id->Classify(country, province);

    string wguess = id->GetGuessWater();
    string cguess = id->GetGuessCountry();

    // special case where subsection of country has been identified but is not in coordinates of country
    // VR-840
    if (province.empty() && NStr::Equal(cguess, country)) {
        return kEmptyStr;
    }

    if (NStr::EqualNocase (country, "State of Palestine") &&
        (NStr::EqualNocase (cguess, "Gaza Strip") ||
         NStr::EqualNocase (cguess, "West Bank"))) {
        return kEmptyStr;
    }

    if (NStr::IsBlank (cguess) && (! NStr::IsBlank (wguess))) {
        // a sea that belongs to the named ocean counts as a match
        string parent = x_FindSurroundingOcean (wguess);
        if ((! NStr::IsBlank (parent)) && NStr::EqualNocase (country, parent)) {
            return kEmptyStr;
        }
    }

    double neardist = 0.0;
    CLatLonCountryMap::TLatLonAdjustFlags adjustment = CLatLonCountryMap::fNone;

    if (!flags && m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 2.0, neardist, country) && neardist < 5.0) {
        id->SetGuessCountry (country);
        id->SetGuessProvince (kEmptyStr);
        flags = id->Classify(country, province);
    }

    if (!flags && !m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)
        && !m_LatLonWaterMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)) {
        // Nothing matched and the claimed region is not nearby: see whether
        // the coordinates were entered swapped or with the wrong hemisphere.
        struct SCandidate {
            double lat;
            double lon;
            CLatLonCountryMap::TLatLonAdjustFlags adjust;
        };
        const SCandidate candidates[] = {
            {  lon_value,  lat_value, CLatLonCountryMap::fFlip      },
            { -lat_value,  lon_value, CLatLonCountryMap::fNegateLat },
            {  lat_value, -lon_value, CLatLonCountryMap::fNegateLon }
        };
        for (const SCandidate& candidate : candidates) {
            unique_ptr<CLatLonCountryId> adjust_id(
                x_CalculateLatLonId(candidate.lat, candidate.lon, country, province));
            const CLatLonCountryId::TClassificationFlags adjusted_flags =
                adjust_id ? adjust_id->Classify(country, province) : 0;
            if (!adjusted_flags) {
                // this variant does not classify at all, try the next one
                continue;
            }
            // The first variant that classifies ends the search, whether or
            // not it is accepted.
            /* do not flip from water */
            string awguess = adjust_id->GetGuessWater();
            string acguess = adjust_id->GetGuessCountry();
            if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
                id = std::move(adjust_id);
                flags = adjusted_flags;
                adjustment = candidate.adjust;
            }
            break;
        }
    }

    string error;

    if (adjustment != CLatLonCountryMap::fNone) {
        if (adjustment == CLatLonCountryMap::fFlip) {
            errcode = eLatLonCountryErr_Value;
            error = "Latitude and longitude values appear to be exchanged";
            lat_lon = MakeLatLon(lon_value, lat_value);
        } else if (adjustment == CLatLonCountryMap::fNegateLat) {
            errcode = eLatLonCountryErr_Value;
            if (lat_value < 0.0) {
                error = "Latitude should be set to N (northern hemisphere)";
            } else {
                error = "Latitude should be set to S (southern hemisphere)";
            }
            lat_lon = MakeLatLon(-lat_value, lon_value);
        } else if (adjustment == CLatLonCountryMap::fNegateLon) {
            errcode = eLatLonCountryErr_Value;
            if (lon_value < 0.0) {
                error = "Longitude should be set to E (eastern hemisphere)";
            } else {
                error = "Longitude should be set to W (western hemisphere)";
            }
            lat_lon = MakeLatLon(lat_value, -lon_value);
        }
    } else if ((flags & CLatLonCountryId::fCountryMatch) && (flags & CLatLonCountryId::fProvinceMatch)) {
        // success!  nothing to report
    } else if (flags & CLatLonCountryId::fWaterMatch) {
        // success!  nothing to report
    } else if ((flags & CLatLonCountryId::fCountryMatch) && NStr::IsBlank(province)) {
        if (check_state) {
            string full_guess = id->GetFullGuess();
            if (!NStr::Equal(full_guess, country)) {
                errcode = eLatLonCountryErr_State;
                error = "Lat_lon " + lat_lon + " is in " + id->GetFullGuess()
                    + " (more specific than " + country + ")";
            }
        }
    } else if (!NStr::IsBlank(id->GetGuessWater())) {
        if (flags & (CLatLonCountryId::fCountryClosest | CLatLonCountryId::fProvinceClosest)) {
            bool suppress = false;
            string reportregion;
            string nosubphrase;
            string desphrase = "designated subregion ";
            string subphrase = "another subregion ";
            string phrase = nosubphrase;
            bool show_claimed = false;

            if (id->GetLandDistance() < 100) {
                // for now, will not report
                // this is a policy decision
                suppress = true;
            } else if (NStr::Find(countryname, "Island") != NPOS) {
                suppress = true;
            }


            if (flags & CLatLonCountryId::fProvinceClosest) {
                reportregion = countryname;
                phrase = desphrase;
            } else {
                // wasn't closest province, so must be closest country
                if (!NStr::IsBlank(province) && check_state) {
                    phrase = subphrase;
                    reportregion = id->GetClosestFull();
                } else {
                    reportregion = id->GetClosestCountry();
                }
                if (!NStr::IsBlank(id->GetClaimedFull())) {
                    show_claimed = true;
                }
            }
            string water = id->GetGuessWater();
            // Points in the Red Sea / Gulf of Mexico next to a bordering
            // country are not reported. The country list in this file spells
            // the name "Djibouti"; the former "Dijibouti" literal could never
            // match it.
            if (NStr::EqualNocase (water, "Red Sea") &&
               (NStr::EqualNocase (reportregion, "Egypt") ||
                NStr::EqualNocase (reportregion, "Saudi Arabia") ||
                NStr::EqualNocase (reportregion, "Sudan") ||
                NStr::EqualNocase (reportregion, "Eritrea") ||
                NStr::EqualNocase (reportregion, "Djibouti") ||
                NStr::EqualNocase (reportregion, "Yemen") ||
                NStr::EqualNocase (reportregion, "Israel") ||
                NStr::EqualNocase (reportregion, "Jordan"))) {
            } else if (NStr::EqualNocase (water, "Gulf of Mexico") &&
               (NStr::EqualNocase (reportregion, "USA") ||
                NStr::EqualNocase (reportregion, "Mexico"))) {
            } else if (!suppress) {
                errcode = eLatLonCountryErr_Water;
                if (show_claimed) {
                    error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion + "' at distance "
                            + NStr::IntToString(id->GetLandDistance())
                            + " km, but in water '" + id->GetGuessWater()
                            + "' - claimed region '" + id->GetClaimedFull()
                            + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
                } else {
                    error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion
                            + "' at distance " + NStr::IntToString(id->GetLandDistance()) + " km, but in water '"
                            + id->GetGuessWater() + "'";
                }
            }
        } else if (neardist > 0.0) {
            errcode = eLatLonCountryErr_Water;
            error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "', '"
                        + countryname + "' is " + NStr::IntToString(m_LatLonCountryMap->AdjustAndRoundDistance(neardist)) + " km away";
        } else {
            errcode = eLatLonCountryErr_Water;
            error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "'";
        }
    } else if (!NStr::IsBlank(id->GetGuessCountry())) {
        string full_guess = id->GetFullGuess();
        if (NStr::EqualNocase (country, "China") && NStr::EqualNocase (full_guess, "Hong Kong")) {
            // skip
        } else if (NStr::IsBlank(id->GetClaimedFull())) {
            if (NStr::Equal(id->GetGuessCountry(), country) && !NStr::Equal(id->GetGuessProvince(), province)) {
                errcode = eLatLonCountryErr_State;
            } else {
                errcode = eLatLonCountryErr_Country;
            }
            error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
                        + countryname + "'";
        } else {
            if (NStr::IsBlank(province)) {
                errcode = eLatLonCountryErr_Country;
                error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
                            + country + "' - claimed region '" + id->GetClaimedFull()
                            + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
            } else {
                errcode = eLatLonCountryErr_Country;
                if (NStr::EqualNocase(id->GetGuessCountry(), country)) {
                    errcode = eLatLonCountryErr_State;
                }
                // a state-level mismatch is only reported when check_state is set
                if (errcode == eLatLonCountryErr_Country || check_state) {
                    error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
                                + countryname + "' - claimed region '" + id->GetClaimedFull()
                                + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
                } else {
                    errcode = eLatLonCountryErr_None;
                }
            }
        }
    } else if (!NStr::IsBlank(id->GetClosestCountry())) {
        errcode = eLatLonCountryErr_Country;
        error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestCountry() + "' instead of '"
                    + countryname + "'";
    } else if (!NStr::IsBlank(id->GetClosestWater())) {
        errcode = eLatLonCountryErr_Water;
        error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestWater() + "' instead of '"
                    + countryname + "'";
    } else {
        errcode = eLatLonCountryErr_Country;
        error = "Unable to determine mapping for lat_lon '" + lat_lon + "' and country '" + countryname + "'";
    }

    return error;
}


// Single-word values accepted for the sex qualifier. Both
// IsValidSexQualifierValue and FixSexQualifierValue lowercase their input
// before searching this table, so every entry is lowercase.
const char* sm_ValidSexQualifierTokens[] = {
  "asexual",
  "bisexual",
  "diecious",
  "dioecious",
  "f",
  "female",
  "gelding",
  "hermaphrodite",
  "intersex",
  "m",
  "male",
  "mixed",
  "monecious",
  "monoecious",
  "neuter",
  "unisexual",
};


// Multi-word sex qualifier values that are accepted as a whole; searched by
// s_IsValidSexQualifierPhrase using an exact string comparison.
const char* sm_ValidSexQualifierPhrases[] = {
  "pooled males and females",
  "pooled male and female",
};


bool s_IsValidSexQualifierPhrase(const string& value)
{
    size_t max = sizeof(sm_ValidSexQualifierPhrases) / sizeof(const char*);

    const char* *begin = sm_ValidSexQualifierPhrases;
    const char* *end = &(sm_ValidSexQualifierPhrases[max]);

    if (find(begin, end, value) != end) {
        return true;
    } else {
        return false;
    }
}


bool CSubSource::IsValidSexQualifierValue (const string& value)

{
    string str = value;
    NStr::ToLower(str);

    if (s_IsValidSexQualifierPhrase(str)) {
        return true;
    }

    vector<string> words;
    NStr::Split(str, " ,/", words);
    if (words.size() == 0) {
        return false;
    }

    size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);

    const char* *begin = sm_ValidSexQualifierTokens;
    const char* *end = &(sm_ValidSexQualifierTokens[max]);

    bool is_good = false;

    ITERATE(vector<string>, w, words) {
        if (NStr::Equal(*w, "and")) {
            // ok, skip it
        } else {
            if (find(begin, end, *w) != end) {
                is_good = true;
            } else {
                is_good = false;
                break;
            }
        }
    }
    return is_good;
}


// Produces a normalized form of a sex qualifier value, or an empty string
// when the value cannot be fixed automatically.
//
// The value is lowercased. An accepted whole phrase is returned as is.
// Otherwise the value is split on spaces, commas and slashes; "and" is
// dropped, "pooled" / "(pooled)" is remembered, "m" and "f" are expanded to
// "male" and "female", and any other word must be an accepted token. The
// remaining tokens are joined as "a", "a and b" or "a, b, and c", prefixed
// with "pooled " when pooling was mentioned.
string CSubSource::FixSexQualifierValue (const string& value)
{
    string lowered = value;
    NStr::ToLower(lowered);

    if (s_IsValidSexQualifierPhrase(lowered)) {
        return lowered;
    }

    vector<string> tokens;
    NStr::Split(lowered, " ,/", tokens);
    if (tokens.empty()) {
        return kEmptyStr;
    }

    vector<string> accepted;
    bool pooled = false;

    for (const string& token : tokens) {
        if (NStr::Equal(token, "and")) {
            continue;
        }
        if (NStr::EqualNocase(token, "(pooled)") || NStr::EqualNocase(token, "pooled")) {
            pooled = true;
            continue;
        }

        bool known = false;
        for (const char* candidate : sm_ValidSexQualifierTokens) {
            if (candidate == token) {
                known = true;
                break;
            }
        }
        if (!known) {
            // if any bad values, can't autofix
            return kEmptyStr;
        }

        if (NStr::Equal(token, "m")) {
            accepted.push_back("male");
        } else if (NStr::Equal(token, "f")) {
            accepted.push_back("female");
        } else {
            accepted.push_back(token);
        }
    }

    if (accepted.empty()) {
        // no good tokens, can't autofix
        return kEmptyStr;
    }

    const size_t last = accepted.size() - 1;
    const bool use_commas = accepted.size() > 2;

    string fixed = accepted[0];
    for (size_t i = 1; i <= last; ++i) {
        if (use_commas) {
            fixed += ",";
        }
        if (i == last) {
            fixed += " and";
        }
        fixed += " " + accepted[i];
    }

    if (pooled) {
        fixed = "pooled " + fixed;
    }
    return fixed;
}


void s_CollectNumberAndUnits(const string& value, string& number, string& units)
{
    number.clear();
    units.clear();

    if (NStr::IsBlank(value)) {
        return;
    }

    string::const_iterator it = value.begin();
    if (*it == '+' || *it == '-') {
        number += *it;
        it++;
    }

    bool any_digit = false;
    bool skip_comma = true;
    while (it != value.end() && (isdigit(*it) || *it == ',')) {
        if (*it == ',') {
            if (skip_comma) {
                // only skip the first comma
                skip_comma = false;
            } else {
                break;
            }
        } else {
            any_digit = true;
            number += *it;
        }
        it++;
    }

    if (it == value.end()) {
        number.clear();
        return;
    }

    if (*it == '.') {
        number += *it;
        it++;
        while (it != value.end() && isdigit(*it)) {
            any_digit = true;
            number += *it;
            it++;
        }
    }

    if (it == value.end() || *it != ' ' || !any_digit) {
        number.clear();
        return;
    }

    it++;
    while (it != value.end()) {
        units += *it;
        it++;
    }
}


// An altitude is valid when it parses as "<number> m": a number accepted by
// s_CollectNumberAndUnits followed by the units "m" (case-sensitive).
bool CSubSource::IsAltitudeValid (const string& value)
{
    if (NStr::IsBlank(value)) {
        return false;
    }

    string number;
    string units;
    s_CollectNumberAndUnits(value, number, units);

    return !NStr::IsBlank(number) && NStr::EqualCase(units, "m");
}


// Returns the number of characters that follow the first '.' in num_str,
// or 0 when num_str contains no '.'.
int CSubSource::x_GetPrecision(const string& num_str)
{
    const size_t dot = NStr::Find(num_str, ".");
    if (dot == NPOS) {
        return 0;
    }
    return int(num_str.length() - dot - 1);
}


// Formats val in fixed notation with the given number of digits after the
// decimal point (printf "%.*lf").
//
// The output length depends on both the magnitude of val and on precision
// (which callers derive from user-supplied text), so the buffer is sized from
// a measuring snprintf call instead of using a fixed-size array.
// Returns an empty string if formatting fails.
string CSubSource::x_FormatWithPrecision(double val, int precision)
{
    // first pass: no buffer, only ask how many characters are needed
    const int length = snprintf(nullptr, 0, "%.*lf", precision, val);
    if (length < 0) {
        return kEmptyStr;
    }

    // second pass: write into a string with room for the terminating NUL
    string rval(static_cast<size_t>(length) + 1, '\0');
    snprintf(&rval[0], rval.size(), "%.*lf", precision, val);
    rval.resize(static_cast<size_t>(length));
    return rval;
}

// Normalizes an altitude to the form "<number> m".
//
// Values given in feet ("ft.", "ft", "feet", "foot") are converted to meters,
// keeping the number of decimal places of the original number. Values given
// in "m.", "meters", "meter" or "m" only have their units normalized.
// Returns an empty string when the value cannot be parsed or uses any other
// units.
string CSubSource::FixAltitude (const string& value)
{
    if (NStr::IsBlank(value)) {
        return kEmptyStr;
    }

    string number;
    string units;
    s_CollectNumberAndUnits(value, number, units);
    if (NStr::IsBlank(number)) {
        return kEmptyStr;
    }

    const bool in_feet = NStr::Equal(units, "ft.")
                      || NStr::Equal(units, "ft")
                      || NStr::Equal(units, "feet")
                      || NStr::Equal(units, "foot");
    if (in_feet) {
        // 1 ft = 0.3048 m
        const int precision = x_GetPrecision(number);
        const double meters = NStr::StringToDouble(number) * 0.3048;
        number = x_FormatWithPrecision(meters, precision);
        units = "m";
    }

    const bool in_meters = NStr::Equal(units, "m.")
                        || NStr::Equal(units, "meters")
                        || NStr::Equal(units, "meter")
                        || NStr::Equal(units, "m");
    if (!in_meters) {
        return kEmptyStr;
    }
    return number + " " + "m";
}


// From VR-793:
// A.    For segment, endogenous_virus_name:
//   1.  Must begin with a letter or number
//   2.  Spaces and other printable characters are permitted
//   3.  Must not be empty, must not be longer than 240 characters

bool CSubSource::x_GenericRepliconNameValid(const string& value)
{
    if (NStr::IsBlank(value)) {
        return false;
    } else if (!isalnum(value.c_str()[0])) {
        return false;
    } else if (value.length() > 240) {
        return false;
    }

    for (auto it : value) {
        if (!isprint(it)) {
            return false;
        }
    }

    return true;
}


// Segment values are checked with the generic replicon-name rules
// (VR-793, section A); see x_GenericRepliconNameValid.
bool CSubSource::IsSegmentValid(const string& value)
{
    return x_GenericRepliconNameValid(value);
}


// Endogenous virus names are checked with the generic replicon-name rules
// (VR-793, section A); see x_GenericRepliconNameValid.
bool CSubSource::IsEndogenousVirusNameValid(const string& value)
{
    return x_GenericRepliconNameValid(value);
}


// From VR-793:
// B.    For chromosome, linkage_group and plasmid_name values:
//   4.  Must begin with a letter or number
//   5.  Must not be empty, must not be longer than 32 characters
//   6.  Must not contain <tab>
//   7.  Spaces and other printable characters are permitted
//   8.  Must not contain the word "plasmid" (ignoring case)
//   9.  Must not contain the word "chromosome" (ignoring case)
//   10. Must not contain the phrase "linkage group" (ignoring case)
//   11. Must not contain the series of letters "chr" (ignoring case)
//   12. Must not contain the taxname (ignoring case)
//   14. Must not contain the genus (ignoring case)
//   15. Must not contain the species (ignoring case)
//       except allow the species to match the value after an initial 'p' (e.g., JX416328)
//   16. Must not contain the series of letters "chrm" (ignoring case)
//   17. Must not contain the series of letters "chrom" (ignoring case)
//   18. Must not contain the phrase "linkage-group" (ignoring case)
static bool s_FailsGenusOrSpeciesTest(const string& value, const string& taxname)
{ // See RW-1436
    if (NStr::IsBlank(taxname) ||
        NStr::StartsWith(taxname, "Plasmid ", NStr::eNocase) ||
        NStr::StartsWith(taxname, "IncQ plasmid", NStr::eNocase)) {
        return false;
    }

    size_t pos = NStr::Find(taxname, " ");
    if (pos != NPOS) {
        string genus = taxname.substr(0, pos);
        if (NStr::FindNoCase(value, genus) != NPOS) {
                // B.14
                return true;
        }
        string species = taxname.substr(pos + 1);

        pos = NStr::FindNoCase(value, species);
        if (pos != NPOS) {
            if (pos != 1 || value[0] != 'p') {
                // B.15
                return true;
            }
        }
    }

    return false;
}

bool CSubSource::x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(const string& value, const string& taxname)
{
    if (NStr::FindNoCase(taxname, "Borrelia") != NPOS || NStr::FindNoCase(taxname, "Borreliella") != NPOS) {
        if (NStr::StartsWith(value, "cp") || NStr::StartsWith(value, "lp")) {
            return true;
        }
    }
    if (!x_GenericRepliconNameValid(value)) {
        // checks for isalnum start, blankness and unprintable characters
        // B.4, B.5, B.7
        return false;
    } else if (value.length() > 32) {
        // B.5
        return false;
    }

    if (s_FailsGenusOrSpeciesTest(value, taxname)) {
        return false;
    }

    static string s_ForbiddenPhrases[] = {
        "\t",  // B.6.
        "plasmid", // B.8
        "chromosome", // B.9
        "linkage group", // B.10
        "chr", // B.11
        "linkage_group", // B.15
        "chrm", // B.16
        "chrom", // B.17
        "linkage-group" // B.18
    };

    for (auto it : s_ForbiddenPhrases) {
        if (NStr::FindNoCase(value, it) != NPOS) {
            return false;
        }
    }
    return true;
}


// A chromosome name must not be blank, must not start with "LG" (ignoring
// case) and must satisfy the common chromosome/linkage-group/plasmid rules.
bool CSubSource::IsChromosomeNameValid(const string& value, const string& taxname)
{
    if (NStr::IsBlank(value) || NStr::StartsWith(value, "LG", NStr::eNocase)) {
        return false;
    }
    return x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(value, taxname);
}


// A linkage group name must not be blank and must satisfy the common
// chromosome/linkage-group/plasmid rules.
bool CSubSource::IsLinkageGroupNameValid(const string& value, const string& taxname)
{
    return !NStr::IsBlank(value)
        && x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(value, taxname);
}


// VR-793
// C.    For plasmid_name values:
//   19. Exception- megaplasmid is legal
// Validates a plasmid_name value.
//
// Accepted outright: "megaplasmid", "megaplasmid <name>" where <name> has no
// further space, and "F", "F factor", "F plasmid". Any other value that
// contains "plasmid" (ignoring case) is valid only if it is in the list of
// known exceptions. Everything else is checked with the common
// chromosome/linkage-group/plasmid rules.
bool CSubSource::IsPlasmidNameValid(const string& value, const string& taxname)
{
    if (NStr::IsBlank(value)) {
        return false;
    }

    // C.19: megaplasmid is legal, alone or followed by a single word
    if (NStr::Equal(value, "megaplasmid")) {
        return true;
    }
    const bool named_megaplasmid = NStr::StartsWith(value, "megaplasmid ")
                                && value.length() > 12
                                && NStr::Find(value.substr(12), " ") == NPOS;
    if (named_megaplasmid) {
        return true;
    }

    const bool f_plasmid = NStr::Equal(value, "F")
                        || NStr::Equal(value, "F factor")
                        || NStr::Equal(value, "F plasmid");
    if (f_plasmid) {
        return true;
    }

    if (NStr::FindNoCase(value,"plasmid") == NPOS) {
        return x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(value, taxname);
    }

    // the value mentions "plasmid": only the listed names are allowed
    static const set<string, PNocase_Conditional> s_PlasmidNameExceptions =
    { // This list comes from RW-1436/RW-1430
        "Plasmid F",
        "Plasmid R",
        "Plasmid pIP630",
        "Plasmid pNG2",
        "Plasmid pGT633",
        "Plasmid pE5",
        "Plasmid pIP1527",
        "Plasmid pAM77",
        "Plasmid pAZ1",
        "Plasmid RP4"
    };

    return s_PlasmidNameExceptions.find(value) != s_PlasmidNameExceptions.end();
}


// A contaminating cell line as a (first, second) pair; CheckCellLine reports
// it as "<first> from <second>".
typedef pair<string, string> TContaminatingCellLine;
// organism -> contaminating cell line
typedef map<string, TContaminatingCellLine> TSpeciesContaminant;
// upper-cased cell line name -> contaminants per organism
typedef map<string, TSpeciesContaminant> TCellLineContaminationMap;

// Table filled from kCellLine by s_InitializeCellLineContaminationMap, which
// holds s_CellLineContaminationMutex while checking the flag and loading.
static TCellLineContaminationMap s_CellLineContaminationMap;
static bool s_CellLineContaminationMapInitialized = false;
DEFINE_STATIC_FAST_MUTEX(s_CellLineContaminationMutex);

#include "cell_line.inc"

static void s_ProcessCellLineLine(const CTempString& line)
{
    vector<string> tokens;
    NStr::Split(line, "\t", tokens);
    if (tokens.size() < 4) {
        ERR_POST_X(1, Warning << "Not enough columns in cell_line entry " << line
                   << "; disregarding");
    } else {
        NStr::ToUpper(tokens[0]);
        (s_CellLineContaminationMap[tokens[0]])[tokens[1]] = TContaminatingCellLine(tokens[2], tokens[3]);
    }
}


// Loads every entry of kCellLine into s_CellLineContaminationMap the first
// time it is called; later calls return without doing anything. The whole
// function runs under s_CellLineContaminationMutex.
static void s_InitializeCellLineContaminationMap(void)
{
    CFastMutexGuard GUARD(s_CellLineContaminationMutex);
    if (!s_CellLineContaminationMapInitialized) {
        // read table
        const size_t entry_count = sizeof(kCellLine) / sizeof (*kCellLine);
        for (size_t i = 0; i < entry_count; ++i) {
            s_ProcessCellLineLine(kCellLine[i]);
        }

        s_CellLineContaminationMapInitialized = true;
    }
}


// Looks up a cell line / organism pair in the cell line contamination table.
//
// The cell line name is compared in upper case, the organism as given.
// Returns a warning message naming the contaminating cell line when the pair
// is listed, otherwise an empty string.
string CSubSource::CheckCellLine(const string& cell_line, const string& organism)
{
    string rval;

    s_InitializeCellLineContaminationMap();
    string cell_line_search = cell_line;
    NStr::ToUpper(cell_line_search);

    // Look up with find() rather than operator[]: operator[] would insert an
    // empty entry into the shared table for every name that is not listed.
    const TCellLineContaminationMap& table = s_CellLineContaminationMap;
    const auto line_it = table.find(cell_line_search);
    if (line_it == table.end()) {
        return rval;
    }
    const TSpeciesContaminant& by_organism = line_it->second;
    const auto organism_it = by_organism.find(organism);
    if (organism_it == by_organism.end()) {
        return rval;
    }

    const TContaminatingCellLine& contaminant = organism_it->second;
    if (!NStr::IsBlank(contaminant.first)) {
        rval = "The International Cell Line Authentication Committee database indicates that " +
               cell_line + " from " + organism + " is known to be contaminated by " +
               contaminant.first +
               " from " + contaminant.second +
               ". Please see http://iclac.org/databases/cross-contaminations/ for more information and references.";
    }
    return rval;
}


// =============================================================================
//                                 Country Names
// =============================================================================


// Legal country names, must be in alphabetical order (case sensitive).
// Besides countries and territories the list also holds oceans and seas
// (e.g. "Atlantic Ocean", "Baltic Sea").  The ordering is byte-wise, which
// is why "USA" precedes "Uganda".
static const char* const s_Countries[] = {
    "Afghanistan",
    "Albania",
    "Algeria",
    "American Samoa",
    "Andorra",
    "Angola",
    "Anguilla",
    "Antarctica",
    "Antigua and Barbuda",
    "Arctic Ocean",
    "Argentina",
    "Armenia",
    "Aruba",
    "Ashmore and Cartier Islands",
    "Atlantic Ocean",
    "Australia",
    "Austria",
    "Azerbaijan",
    "Bahamas",
    "Bahrain",
    "Baker Island",
    "Baltic Sea",
    "Bangladesh",
    "Barbados",
    "Bassas da India",
    "Belarus",
    "Belgium",
    "Belize",
    "Benin",
    "Bermuda",
    "Bhutan",
    "Bolivia",
    "Borneo",
    "Bosnia and Herzegovina",
    "Botswana",
    "Bouvet Island",
    "Brazil",
    "British Virgin Islands",
    "Brunei",
    "Bulgaria",
    "Burkina Faso",
    "Burundi",
    "Cambodia",
    "Cameroon",
    "Canada",
    "Cape Verde",
    "Cayman Islands",
    "Central African Republic",
    "Chad",
    "Chile",
    "China",
    "Christmas Island",
    "Clipperton Island",
    "Cocos Islands",
    "Colombia",
    "Comoros",
    "Cook Islands",
    "Coral Sea Islands",
    "Costa Rica",
    "Cote d'Ivoire",
    "Croatia",
    "Cuba",
    "Curacao",
    "Cyprus",
    "Czech Republic",
    "Democratic Republic of the Congo",
    "Denmark",
    "Djibouti",
    "Dominica",
    "Dominican Republic",
    "Ecuador",
    "Egypt",
    "El Salvador",
    "Equatorial Guinea",
    "Eritrea",
    "Estonia",
    "Eswatini",
    "Ethiopia",
    "Europa Island",
    "Falkland Islands (Islas Malvinas)",
    "Faroe Islands",
    "Fiji",
    "Finland",
    "France",
    "French Guiana",
    "French Polynesia",
    "French Southern and Antarctic Lands",
    "Gabon",
    "Gambia",
    "Gaza Strip",
    "Georgia",
    "Germany",
    "Ghana",
    "Gibraltar",
    "Glorioso Islands",
    "Greece",
    "Greenland",
    "Grenada",
    "Guadeloupe",
    "Guam",
    "Guatemala",
    "Guernsey",
    "Guinea",
    "Guinea-Bissau",
    "Guyana",
    "Haiti",
    "Heard Island and McDonald Islands",
    "Honduras",
    "Hong Kong",
    "Howland Island",
    "Hungary",
    "Iceland",
    "India",
    "Indian Ocean",
    "Indonesia",
    "Iran",
    "Iraq",
    "Ireland",
    "Isle of Man",
    "Israel",
    "Italy",
    "Jamaica",
    "Jan Mayen",
    "Japan",
    "Jarvis Island",
    "Jersey",
    "Johnston Atoll",
    "Jordan",
    "Juan de Nova Island",
    "Kazakhstan",
    "Kenya",
    "Kerguelen Archipelago",
    "Kingman Reef",
    "Kiribati",
    "Kosovo",
    "Kuwait",
    "Kyrgyzstan",
    "Laos",
    "Latvia",
    "Lebanon",
    "Lesotho",
    "Liberia",
    "Libya",
    "Liechtenstein",
    "Line Islands",
    "Lithuania",
    "Luxembourg",
    "Macau",
    "Madagascar",
    "Malawi",
    "Malaysia",
    "Maldives",
    "Mali",
    "Malta",
    "Marshall Islands",
    "Martinique",
    "Mauritania",
    "Mauritius",
    "Mayotte",
    "Mediterranean Sea",
    "Mexico",
    "Micronesia, Federated States of",
    "Midway Islands",
    "Moldova",
    "Monaco",
    "Mongolia",
    "Montenegro",
    "Montserrat",
    "Morocco",
    "Mozambique",
    "Myanmar",
    "Namibia",
    "Nauru",
    "Navassa Island",
    "Nepal",
    "Netherlands",
    "New Caledonia",
    "New Zealand",
    "Nicaragua",
    "Niger",
    "Nigeria",
    "Niue",
    "Norfolk Island",
    "North Korea",
    "North Macedonia",
    "North Sea",
    "Northern Mariana Islands",
    "Norway",
    "Oman",
    "Pacific Ocean",
    "Pakistan",
    "Palau",
    "Palmyra Atoll",
    "Panama",
    "Papua New Guinea",
    "Paracel Islands",
    "Paraguay",
    "Peru",
    "Philippines",
    "Pitcairn Islands",
    "Poland",
    "Portugal",
    "Puerto Rico",
    "Qatar",
    "Republic of the Congo",
    "Reunion",
    "Romania",
    "Ross Sea",
    "Russia",
    "Rwanda",
    "Saint Barthelemy",
    "Saint Helena",
    "Saint Kitts and Nevis",
    "Saint Lucia",
    "Saint Martin",
    "Saint Pierre and Miquelon",
    "Saint Vincent and the Grenadines",
    "Samoa",
    "San Marino",
    "Sao Tome and Principe",
    "Saudi Arabia",
    "Senegal",
    "Serbia",
    "Seychelles",
    "Sierra Leone",
    "Singapore",
    "Sint Maarten",
    "Slovakia",
    "Slovenia",
    "Solomon Islands",
    "Somalia",
    "South Africa",
    "South Georgia and the South Sandwich Islands",
    "South Korea",
    "South Sudan",
    "Southern Ocean",
    "Spain",
    "Spratly Islands",
    "Sri Lanka",
    "State of Palestine",
    "Sudan",
    "Suriname",
    "Svalbard",
    "Sweden",
    "Switzerland",
    "Syria",
    "Taiwan",
    "Tajikistan",
    "Tanzania",
    "Tasman Sea",
    "Thailand",
    "Timor-Leste",
    "Togo",
    "Tokelau",
    "Tonga",
    "Trinidad and Tobago",
    "Tromelin Island",
    "Tunisia",
    "Turkey",
    "Turkmenistan",
    "Turks and Caicos Islands",
    "Tuvalu",
    "USA",
    "Uganda",
    "Ukraine",
    "United Arab Emirates",
    "United Kingdom",
    "Uruguay",
    "Uzbekistan",
    "Vanuatu",
    "Venezuela",
    "Viet Nam",
    "Virgin Islands",
    "Wake Island",
    "Wallis and Futuna",
    "West Bank",
    "Western Sahara",
    "Yemen",
    "Zambia",
    "Zimbabwe"
};
// Lookup set over s_Countries; this is what the CCountries validation
// functions search.
static const TCStrSet s_CountriesSet(s_Countries, sizeof(s_Countries), __FILE__, __LINE__);

// Former legal country names, must be in alphabetical order (case sensitive).
// These names are still accepted by CCountries::IsValid and are the only
// names accepted by CCountries::WasValid.
static const char* const s_Former_Countries[] = {
    "Belgian Congo",
    "British Guiana",
    "Burma",
    "Czechoslovakia",
    "East Timor",
    "Korea",
    "Macedonia",
    "Micronesia",
    "Netherlands Antilles",
    "Serbia and Montenegro",
    "Siam",
    "Swaziland",
    "The former Yugoslav Republic of Macedonia",
    "USSR",
    "Yugoslavia",
    "Zaire"
};
// Lookup set over s_Former_Countries.
static const TCStrSet s_Former_CountriesSet(s_Former_Countries, sizeof(s_Former_Countries), __FILE__, __LINE__);

// Null term exemption values, must be in alphabetical order (case sensitive).
// CCountries::IsValid accepts these in place of a country name.  Note that
// IsValid only looks up the text before the first ':', so of the
// "missing: ..." entries it is the bare "missing" entry that actually matches.
static const char* const s_Null_Countries[] = {
    "missing",
    "missing: control sample",
    "missing: data agreement established pre-2023",
    "missing: endangered species",
    "missing: human-identifiable",
    "missing: lab stock",
    "missing: sample group",
    "missing: synthetic construct",
    "missing: third party data",
    "not applicable",
    "not collected",
    "not provided",
    "restricted access"
};
// Lookup set over s_Null_Countries.
static const TCStrSet s_Null_CountriesSet(s_Null_Countries, sizeof(s_Null_Countries), __FILE__, __LINE__);

// Reports whether 'country' names a current country, a former country, or a
// null-exemption term.  Only the text before the first ':' is looked up, and
// the comparison is case sensitive.  A value whose last character is that
// first ':' is rejected outright.
bool CCountries::IsValid(const string& country)
{
    const size_t colon = country.find(':');
    if (colon != NPOS  &&  colon + 1 == country.length()) {
        // a colon with nothing after it
        return false;
    }

    const string name = (colon == NPOS) ? country : country.substr(0, colon);
    const char* const key = name.c_str();

    return s_CountriesSet.find(key) != s_CountriesSet.end()
        || s_Former_CountriesSet.find(key) != s_Former_CountriesSet.end()
        || s_Null_CountriesSet.find(key) != s_Null_CountriesSet.end();
}


bool CCountries::IsValid(const string& country, bool& is_miscapitalized)
{
    string name = country;
    size_t pos = country.find(':');

    if ( pos != NPOS ) {
        name = country.substr(0, pos);
        if (pos == country.length() - 1) {
            return false;
        }
    }

    is_miscapitalized = false;
    // try current countries
    // fast check for properly capitalized
    if ( s_CountriesSet.find(name.c_str()) != s_CountriesSet.end() ) {
        return true;
    }
    if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
        return true;
    }
    if ( s_Null_CountriesSet.find(name.c_str()) != s_Null_CountriesSet.end() ) {
        return true;
    }
    // slow check for miscapitalized
    ITERATE ( TCStrSet, it, s_CountriesSet ) {
        if ( NStr::EqualNocase(name, *it) ) {
            is_miscapitalized = true;
            return true;
        }
    }
    ITERATE ( TCStrSet, it, s_Former_CountriesSet ) {
        if ( NStr::EqualNocase(name, *it) ) {
            is_miscapitalized = true;
            return true;
        }
    }
    ITERATE ( TCStrSet, it, s_Null_CountriesSet ) {
        if ( NStr::EqualNocase(name, *it) ) {
            is_miscapitalized = true;
            return true;
        }
    }

    return false;
}


// Reports whether 'country' is one of the formerly-valid country names.
// Only the text before the first ':' is compared (the whole string when
// there is no ':'), and the comparison is case sensitive.
bool CCountries::WasValid(const string& country)
{
    // substr() clamps the count, so a missing ':' yields the whole string
    const string name = country.substr(0, country.find(':'));

    const bool listed =
        s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end();
    return listed;
}


bool CCountries::WasValid(const string& country, bool& is_miscapitalized)
{
    string name = country;
    size_t pos = country.find(':');

    if ( pos != NPOS ) {
        name = country.substr(0, pos);
    }

    is_miscapitalized = false;
    // try formerly-valid countries
    // fast check for properly capitalized
    if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
        return true;
    }
    // slow check for miscapitalized
    ITERATE ( TCStrSet, it, s_Former_CountriesSet ) {
        if ( NStr::EqualNocase(name, *it) ) {
            is_miscapitalized = true;
            return true;
        }
    }
    return false;
}

/////////////////////////////////////////////////////////////////////////////
////// Country Capitalization Fix ///////////////////////////////////////////

// Whole-value replacements used by CCountries::WholeCountryFix.
// Keys are written in lower case because WholeCountryFix lower-cases its
// input before looking it up; the map itself compares case-sensitively
// (PCase_CStr).
static const SStaticPair<const char*, const char*> s_map_whole_country_fixes[] =
{
  {"england", "United Kingdom: England"},
  {"great britain", "United Kingdom: Great Britain"},
  {"new jersey, usa", "USA: New Jersey"}
};
// Map type shared by all the country "fix" tables: C-string key to C-string
// replacement, case-sensitive key comparison.
typedef CStaticPairArrayMap<const char*, const char*, PCase_CStr> TCStringPairsMap;
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap, k_whole_country_fixes, s_map_whole_country_fixes);

// Replacements for country names: three-letter codes, '&' spellings,
// abbreviations and other variants, each mapped to a replacement name.
// Entries are listed in case-sensitive key order (all-caps codes sort ahead
// of mixed-case names with the same first letter) -- presumably required by
// the static array map; keep the order when adding entries.
// NOTE(review): several replacement values are not present in s_Countries
// (e.g. "Aland Islands", "French Southern Territories",
// "Lao People's Democratic Republic", "Libyan Arab Jamahiriya", "Macao",
// "Pitcairn", "Palestinian Territory", "Syrian Arab Republic") -- confirm
// whether that is intended.
static const SStaticPair<const char*, const char*> s_map_country_name_fixes[] = {
{"ABW", "Aruba"},
{"AFG", "Afghanistan"},
{"AGO", "Angola"},
{"AIA", "Anguilla"},
{"ALA", "Aland Islands"},
{"ALB", "Albania"},
{"AND", "Andorra"},
{"ARE", "United Arab Emirates"},
{"ARG", "Argentina"},
{"ARM", "Armenia"},
{"ASM", "American Samoa"},
{"ATA", "Antarctica"},
{"ATF", "French Southern Territories"},
{"ATG", "Antigua and Barbuda"},
{"AUS", "Australia"},
{"AUT", "Austria"},
{"AZE", "Azerbaijan"},
{"Antigua & Barbuda", "Antigua and Barbuda"},
{"Ashmore & Cartier Islands", "Ashmore and Cartier Islands"},
{"BDI", "Burundi"},
{"BEL", "Belgium"},
{"BEN", "Benin"},
{"BES", "Bonaire, Sint Eustatius and Saba"},
{"BFA", "Burkina Faso"},
{"BGD", "Bangladesh"},
{"BGR", "Bulgaria"},
{"BHR", "Bahrain"},
{"BHS", "Bahamas"},
{"BIH", "Bosnia and Herzegovina"},
{"BLM", "Saint Barthelemy"},
{"BLR", "Belarus"},
{"BLZ", "Belize"},
{"BMU", "Bermuda"},
{"BOL", "Bolivia"},
{"BRA", "Brazil"},
{"BRB", "Barbados"},
{"BRN", "Brunei"},
{"BTN", "Bhutan"},
{"BVT", "Bouvet Island"},
{"BWA", "Botswana"},
{"Brasil", "Brazil"},
{"CAF", "Central African Republic"},
{"CAN", "Canada"},
{"CCK", "Cocos Islands"},
{"CHE", "Switzerland"},
{"CHL", "Chile"},
{"CHN", "China"},
{"CIV", "Cote d'Ivoire"},
{"CMR", "Cameroon"},
{"COD", "Democratic Republic of the Congo"},
{"COG", "Republic of the Congo"},
{"COK", "Cook Islands"},
{"COL", "Colombia"},
{"COM", "Comoros"},
{"CPV", "Cape Verde"},
{"CRI", "Costa Rica"},
{"CUB", "Cuba"},
{"CUW", "Curacao"},
{"CXR", "Christmas Island"},
{"CYM", "Cayman Islands"},
{"CYP", "Cyprus"},
{"CZE", "Czech Republic"},
{"Cape Verde Islands", "Cape Verde"},
{"DEU", "Germany"},
{"DJI", "Djibouti"},
{"DMA", "Dominica"},
{"DNK", "Denmark"},
{"DOM", "Dominican Republic"},
{"DZA", "Algeria"},
{"Democratic Republic of Congo", "Democratic Republic of the Congo"},
{"ECU", "Ecuador"},
{"EGY", "Egypt"},
{"ERI", "Eritrea"},
{"ESH", "Western Sahara"},
{"ESP", "Spain"},
{"EST", "Estonia"},
{"ETH", "Ethiopia"},
{"FIN", "Finland"},
{"FJI", "Fiji"},
{"FLK", "Falkland Islands (Islas Malvinas)"},
{"FRA", "France"},
{"FRO", "Faroe Islands"},
{"FSM", "Micronesia, Federated States of"},
{"Falkland Islands", "Falkland Islands (Islas Malvinas)"},
{"French Southern & Antarctic Lands", "French Southern and Antarctic Lands"},
{"GAB", "Gabon"},
{"GBR", "United Kingdom"},
{"GEO", "Georgia"},
{"GGY", "Guernsey"},
{"GHA", "Ghana"},
{"GIB", "Gibraltar"},
{"GIN", "Guinea"},
{"GLP", "Guadeloupe"},
{"GMB", "Gambia"},
{"GNB", "Guinea-Bissau"},
{"GNQ", "Equatorial Guinea"},
{"GRC", "Greece"},
{"GRD", "Grenada"},
{"GRL", "Greenland"},
{"GTM", "Guatemala"},
{"GUF", "French Guiana"},
{"GUM", "Guam"},
{"GUY", "Guyana"},
{"HKG", "Hong Kong"},
{"HMD", "Heard Island and McDonald Islands"},
{"HND", "Honduras"},
{"HRV", "Croatia"},
{"HTI", "Haiti"},
{"HUN", "Hungary"},
{"Heard Island & McDonald Islands", "Heard Island and McDonald Islands"},
{"IDN", "Indonesia"},
{"IMN", "Isle of Man"},
{"IND", "India"},
{"IOT", "British Indian Ocean Territory"},
{"IRL", "Ireland"},
{"IRN", "Iran"},
{"IRQ", "Iraq"},
{"ISL", "Iceland"},
{"ISR", "Israel"},
{"ITA", "Italy"},
{"Ivory Coast", "Cote d'Ivoire"},
{"JAM", "Jamaica"},
{"JEY", "Jersey"},
{"JOR", "Jordan"},
{"JPN", "Japan"},
{"KAZ", "Kazakhstan"},
{"KEN", "Kenya"},
{"KGZ", "Kyrgyzstan"},
{"KHM", "Cambodia"},
{"KIR", "Kiribati"},
{"KNA", "Saint Kitts and Nevis"},
{"KOR", "South Korea"},
{"KWT", "Kuwait"},
{"LAO", "Lao People's Democratic Republic"},
{"LBN", "Lebanon"},
{"LBR", "Liberia"},
{"LBY", "Libyan Arab Jamahiriya"},
{"LCA", "Saint Lucia"},
{"LIE", "Liechtenstein"},
{"LKA", "Sri Lanka"},
{"LSO", "Lesotho"},
{"LTU", "Lithuania"},
{"LUX", "Luxembourg"},
{"LVA", "Latvia"},
{"La Reunion Island", "Reunion"},
{"Luxemburg", "Luxembourg"},
{"MAC", "Macao"},
{"MAF", "Saint Martin (French part)"},
{"MAR", "Morocco"},
{"MCO", "Monaco"},
{"MDA", "Moldova"},
{"MDG", "Madagascar"},
{"MDV", "Maldives"},
{"MEX", "Mexico"},
{"MHL", "Marshall Islands"},
{"MKD", "North Macedonia"},
{"MLI", "Mali"},
{"MLT", "Malta"},
{"MMR", "Myanmar"},
{"MNE", "Montenegro"},
{"MNG", "Mongolia"},
{"MNP", "Northern Mariana Islands"},
{"MOZ", "Mozambique"},
{"MRT", "Mauritania"},
{"MSR", "Montserrat"},
{"MTQ", "Martinique"},
{"MUS", "Mauritius"},
{"MWI", "Malawi"},
{"MYS", "Malaysia"},
{"MYT", "Mayotte"},
{"Macedonia", "North Macedonia"},
{"NAM", "Namibia"},
{"NCL", "New Caledonia"},
{"NER", "Niger"},
{"NFK", "Norfolk Island"},
{"NGA", "Nigeria"},
{"NIC", "Nicaragua"},
{"NIU", "Niue"},
{"NLD", "Netherlands"},
{"NOR", "Norway"},
{"NPL", "Nepal"},
{"NRU", "Nauru"},
{"NZL", "New Zealand"},
{"Netherland", "Netherlands"},
{"New Guinea", "Papua New Guinea"},
{"OMN", "Oman"},
{"P, R, China", "China"},
{"P.R. China", "China"},
{"P.R.China", "China"},
{"PAK", "Pakistan"},
{"PAN", "Panama"},
{"PCN", "Pitcairn"},
{"PER", "Peru"},
{"PHL", "Philippines"},
{"PLW", "Palau"},
{"PNG", "Papua New Guinea"},
{"POL", "Poland"},
{"PRI", "Puerto Rico"},
{"PRK", "North Korea"},
{"PRT", "Portugal"},
{"PRY", "Paraguay"},
{"PSE", "Palestinian Territory"},
{"PYF", "French Polynesia"},
{"People's Republic of China", "China"},
{"Pr China", "China"},
{"Prchina", "China"},
{"QAT", "Qatar"},
{"REU", "Reunion"},
{"ROU", "Romania"},
{"RUS", "Russia"},
{"RWA", "Rwanda"},
{"Republic of Congo", "Republic of the Congo"},
{"SAU", "Saudi Arabia"},
{"SDN", "Sudan"},
{"SEN", "Senegal"},
{"SGP", "Singapore"},
{"SGS", "South Georgia and the South Sandwich Islands"},
{"SHN", "Saint Helena"},
{"SJM", "Svalbard and Jan Mayen"},
{"SLB", "Solomon Islands"},
{"SLE", "Sierra Leone"},
{"SLV", "El Salvador"},
{"SMR", "San Marino"},
{"SOM", "Somalia"},
{"SPM", "Saint Pierre and Miquelon"},
{"SRB", "Serbia"},
{"SSD", "South Sudan"},
{"STP", "Sao Tome and Principe"},
{"SUR", "Suriname"},
{"SVK", "Slovakia"},
{"SVN", "Slovenia"},
{"SWE", "Sweden"},
{"SWZ", "Eswatini"},
{"SXM", "Sint Maarten (Dutch part)"},
{"SYC", "Seychelles"},
{"SYR", "Syrian Arab Republic"},
{"Saint Kitts & Nevis", "Saint Kitts and Nevis"},
{"Saint Pierre & Miquelon", "Saint Pierre and Miquelon"},
{"Saint Vincent & Grenadines", "Saint Vincent and the Grenadines"},
{"Saint Vincent & the Grenadines", "Saint Vincent and the Grenadines"},
{"Saint Vincent and Grenadines", "Saint Vincent and the Grenadines"},
{"San Tome and Principe Island", "Sao Tome and Principe"},
{"Sao Tome & Principe", "Sao Tome and Principe"},
{"South Georgia & South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
{"South Georgia & the South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
{"St Helena", "Saint Helena"},
{"St Lucia", "Saint Lucia"},
{"St Pierre and Miquelon", "Saint Pierre and Miquelon"},
{"St Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
{"St. Helena", "Saint Helena"},
{"St. Lucia", "Saint Lucia"},
{"St. Pierre and Miquelon", "Saint Pierre and Miquelon"},
{"St. Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
{"TCA", "Turks and Caicos Islands"},
{"TCD", "Chad"},
{"TGO", "Togo"},
{"THA", "Thailand"},
{"TJK", "Tajikistan"},
{"TKL", "Tokelau"},
{"TKM", "Turkmenistan"},
{"TLS", "Timor-Leste"},
{"TON", "Tonga"},
{"TTO", "Trinidad and Tobago"},
{"TUN", "Tunisia"},
{"TUR", "Turkey"},
{"TUV", "Tuvalu"},
{"TWN", "Taiwan"},
{"TZA", "Tanzania"},
{"The Netherlands", "Netherlands"},
{"Trinidad & Tobago", "Trinidad and Tobago"},
{"Turks & Caicos", "Turks and Caicos Islands"},
{"Turks & Caicos Islands", "Turks and Caicos Islands"},
{"Turks and Caicos", "Turks and Caicos Islands"},
{"U.S.A.", "USA"},
{"UGA", "Uganda"},
{"UK", "United Kingdom"},
{"UKR", "Ukraine"},
{"UMI", "United States Minor Outlying Islands"},
{"URY", "Uruguay"},
{"UZB", "Uzbekistan"},
{"United States", "USA"},
{"United States of America", "USA"},
{"VAT", "Holy See (Vatican City State)"},
{"VCT", "Saint Vincent and the Grenadines"},
{"VEN", "Venezuela"},
{"VGB", "British Virgin Islands"},
{"VIR", "Virgin Islands"},
{"VNM", "Viet Nam"},
{"VUT", "Vanuatu"},
{"Vietnam", "Viet Nam"},
{"WLF", "Wallis and Futuna"},
{"WSM", "Samoa"},
{"YEM", "Yemen"},
{"ZAF", "South Africa"},
{"ZMB", "Zambia"},
{"ZWE", "Zimbabwe"},
{"the Netherlands", "Netherlands"}
};

DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap,k_country_name_fixes, s_map_country_name_fixes);

// for GP-24841
// Replacements for historical country names; both keys also appear in
// s_Former_Countries.  Entries are listed in case-sensitive key order.
static const SStaticPair<const char*, const char*> s_map_old_country_name_fixes[] = {
{"Burma", "Myanmar"},
{"Siam", "Thailand"}
};
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap,k_old_country_name_fixes, s_map_old_country_name_fixes);

// for GB-7408
// Replacements for names of sub-regions (islands, constituent countries,
// former states) given without their country: each maps to
// "<country>: <sub-region>".  Entries are listed in case-sensitive key order.
// Some keys ("Bassas da India", "Europa Island", "Glorioso Islands",
// "Juan de Nova Island", "Tromelin Island") are also entries of s_Countries.
static const SStaticPair<const char*, const char*> s_map_subregion_fixes[] = {
{"Antigua", "Antigua and Barbuda: Antigua"},
{"Ashmore Island", "Ashmore and Cartier Islands: Ashmore Island"},
{"Autonomous Region of the Azores", "Portugal: Azores"},
{"Azores", "Portugal: Azores"},
{"Barbuda", "Antigua and Barbuda: Barbuda"},
{"Bassas da India", "French Southern and Antarctic Lands: Bassas da India"},
{"Caicos Islands", "Turks and Caicos Islands: Caicos Islands"},
{"Canary Islands", "Spain: Canary Islands"},
{"Cartier Island", "Ashmore and Cartier Islands: Cartier Island"},
{"East Germany", "Germany: East Germany"},
{"El Hierro", "Spain: El Hierro"},
{"Europa Island", "French Southern and Antarctic Lands: Europa Island"},
{"Fuerteventura", "Spain: Fuerteventura"},
{"Glorioso Islands", "French Southern and Antarctic Lands: Glorioso Islands"},
{"Gran Canaria", "Spain: Gran Canaria"},
{"Grenadines", "Saint Vincent and the Grenadines: Grenadines"},
{"Heard Island", "Heard Island and McDonald Islands: Heard Island"},
{"Ile Amsterdam", "French Southern and Antarctic Lands: Ile Amsterdam"},
{"Ile Saint-Paul", "French Southern and Antarctic Lands: Ile Saint-Paul"},
{"Iles Crozet", "French Southern and Antarctic Lands: Iles Crozet"},
{"Iles Kerguelen", "French Southern and Antarctic Lands: Iles Kerguelen"},
{"Juan de Nova Island", "French Southern and Antarctic Lands: Juan de Nova Island"},
{"La Gomera", "Spain: La Gomera"},
{"La Graciosa", "Spain: La Graciosa"},
{"La Palma", "Spain: La Palma"},
{"Lanzarote", "Spain: Lanzarote"},
{"Madeira", "Portugal: Madeira"},
{"McDonald Island", "Heard Island and McDonald Islands: McDonald Island"},
{"McDonald Islands", "Heard Island and McDonald Islands: McDonald Islands"},
{"Miquelon", "Saint Pierre and Miquelon: Miquelon"},
{"Nevis", "Saint Kitts and Nevis: Nevis"},
{"Principe", "Sao Tome and Principe: Principe"},
{"Saint Kitts", "Saint Kitts and Nevis: Saint Kitts"},
{"Saint Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
{"Saint Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
{"Sao Tome", "Sao Tome and Principe: Sao Tome"},
{"Scotland", "United Kingdom: Scotland"},
{"South Sandwich Islands", "South Georgia and the South Sandwich Islands: South Sandwich Islands"},
{"St Kitts", "Saint Kitts and Nevis: Saint Kitts"},
{"St Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
{"St Thomas", "USA: Saint Thomas"},
{"St Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
{"St. Kitts", "Saint Kitts and Nevis: Saint Kitts"},
{"St. Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
{"St. Thomas", "USA: Saint Thomas"},
{"St. Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
{"Tenerife", "Spain: Tenerife"},
{"Tobago", "Trinidad and Tobago: Tobago"},
{"Trinidad", "Trinidad and Tobago: Trinidad"},
{"Tromelin Island", "French Southern and Antarctic Lands: Tromelin Island"},
{"Turks Islands", "Turks and Caicos Islands: Turks Islands"},
{"Wales", "United Kingdom: Wales"},
{"West Germany", "Germany: West Germany"},

};
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap,k_subregion_fixes, s_map_subregion_fixes);


// Names of the US states plus "District of Columbia", in canonical
// capitalization.  CCountries::WholeCountryFix compares its input against
// these case-insensitively and produces "USA: <state>".
// The pointers themselves are const, matching the other name tables in this
// file, so the table cannot be modified at run time.
static const char* const s_USAStates[] = {
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "District of Columbia",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming"
};

// Upper-cases the first character of every word in 'phrase'.
//
// The phrase is split on space, tab, CR and LF; each non-empty piece whose
// first character is a letter gets that character upper-cased, and the
// pieces are joined back together with single spaces.
//
// The character is converted to unsigned char before it is handed to
// isalpha()/toupper(): passing a negative char (any byte >= 0x80 where char
// is signed) to those functions is undefined behavior.
string CCountries::CapitalizeFirstLetterOfEveryWord (const string &phrase)
{
    vector<string> words;
    NStr::Split(phrase, " \t\r\n", words);
    for (string& word : words) {
        if (word.empty()) {
            continue;
        }
        const unsigned char first = (unsigned char)word[0];
        if (isalpha(first)) {
            word[0] = (char)toupper(first);
        }
    }
    return NStr::Join(words," ");
}

// Returns a corrected "Country: region" value when the whole input is a known
// special case, or an empty string when it is not.
//
// The input is lower-cased and looked up in k_whole_country_fixes; failing
// that, it is compared case-insensitively against the US state names and, on
// a match, "USA: <state>" is returned with the table's capitalization.
string CCountries::WholeCountryFix(string country)
{
    // 'country' is our own copy, so it can be lower-cased in place
    NStr::ToLower(country);

    const TCStringPairsMap::const_iterator whole = k_whole_country_fixes.find(country.c_str());
    if (whole != k_whole_country_fixes.end()) {
        return whole->second;
    }

    for (const char* state : s_USAStates) {
        if (NStr::EqualNocase(state, country)) {
            return string("USA: ") + state;
        }
    }

    return string();
}

bool CCountries::IsSubstringOfStringInList(const string& phrase, const string& country1, size_t pos1)
{
    bool r = false;
    ITERATE ( TCStrSet, c, s_CountriesSet )
    {
        string country2(*c);
        if (country2.length() > country1.length() && NStr::FindNoCase(country2,country1) != NPOS)
        {
            SIZE_TYPE pos2 = NStr::FindNoCase(phrase,country2);
            while (pos2 != NPOS)
            {
                if (pos2 <= pos1 && pos2+country2.length() >= pos1+country1.length())
                    r = true;
                pos2 = NStr::FindNoCase(phrase,country2,pos2+country2.length());
            }
        }
    }
    return r;
}

// Reports whether 'phrase' contains more than one country name.
//
// Each entry of s_CountriesSet is searched for in 'phrase' ignoring case.
// An occurrence counts only when it is not directly preceded or followed by
// a letter and is not part of a longer country name at the same place (see
// IsSubstringOfStringInList).  Repeated occurrences of the same name each
// count.
//
// Characters are converted to unsigned char before being passed to
// isalpha(); passing a negative char is undefined behavior.
bool CCountries::ContainsMultipleCountryNames (const string &phrase)
{
    int num_matches = 0;
    ITERATE ( TCStrSet, c, s_CountriesSet )
    {
        const string country(*c);
        const size_t name_len = country.length();

        for (size_t pos = NStr::FindNoCase(phrase, country);
             pos != NPOS;
             pos = NStr::FindNoCase(phrase, country, pos + name_len))
        {
            const size_t after = pos + name_len;
            const bool letter_after =
                after < phrase.length()  &&  isalpha((unsigned char)phrase[after]);
            const bool letter_before =
                pos > 0  &&  isalpha((unsigned char)phrase[pos - 1]);

            if (!letter_after  &&  !letter_before
                &&  !IsSubstringOfStringInList(phrase, country, pos)) {
                num_matches++;
            }
        }
    }
    return (num_matches > 1);
}

// Returns the entry of s_CountriesSet that equals 'country' when case is
// ignored, or 'country' unchanged when there is no such entry.  Only current
// country names are consulted.
string CCountries::GetCorrectedCountryCapitalization(const string& country)
{
    string corrected(country);
    for (const char* listed : s_CountriesSet) {
        if ( NStr::EqualNocase(country, listed) ) {
            corrected = listed;
        }
    }
    return corrected;
}


// Strips surrounding whitespace and stray delimiters from both ends of 'val',
// in place.  One removal is performed per loop pass and the loop repeats
// until a pass removes nothing (or 'val' becomes empty).
//
// Removed from the front: ',', ':', '.', and ')' unless except_paren is set.
// Removed from the back:  ',', ':', '(' unless except_paren is set, a
// trailing "the" that follows a non-letter, and under the conditions
// described below a trailing '.'.
//
// @param val           string to clean up; modified in place
// @param except_paren  when true, a leading ')' and a trailing '(' are kept
void CCountries::x_RemoveDelimitersFromEnds(string& val, bool except_paren)
{
    NStr::TruncateSpacesInPlace(val);
    bool any_found = true;
    while (!val.empty() && any_found) {
        any_found = false;
        if (NStr::StartsWith(val, ",")
            || NStr::StartsWith(val, ":")
            || NStr::StartsWith(val, ".")
            || (!except_paren && NStr::StartsWith(val, ")"))) {
            // drop one leading delimiter
            val = val.substr(1);
            any_found = true;
            NStr::TruncateSpacesInPlace(val);
        } else if (NStr::EndsWith(val, ",")
            || NStr::EndsWith(val, ":")
            || (!except_paren && NStr::EndsWith(val, "("))) {
            // drop one trailing delimiter
            val = val.substr(0, val.length() - 1);
            any_found = true;
            NStr::TruncateSpacesInPlace(val);
        } else if (NStr::EndsWith(val, "the") && val.length() > 3 && !isalpha((unsigned char)val[val.length() - 4])) {
            // trailing "the" preceded by a non-letter: remove the word
            // together with that one preceding character
            val = val.substr(0, val.length() - 4);
            any_found = true;
        } else if (NStr::EndsWith(val, ".")) {
            size_t len = val.length();
            if (len > 1 && isspace((unsigned char)val[len - 2])) {
                // period that follows whitespace
                val = val.substr(0, val.length() - 1);
                any_found = true;
                NStr::TruncateSpacesInPlace(val);
            } else if (len > 5) {
                // make sure no spaces or punctuation within 4 characters before '.'
                bool do_remove = true;
                // len > 5 guarantees the four positions examined
                // (len-2 .. len-5) are inside the string
                size_t pos = val.length() - 2;
                size_t dist = 0;
                while (dist < 4 && do_remove) {
                    if (isspace((unsigned char)val[pos]) || ispunct((unsigned char)val[pos])) {
                        do_remove = false;
                    }
                    pos--;
                    dist++;
                }
                if (do_remove) {
                    val = val.substr(0, val.length() - 1);
                    any_found = true;
                }
            }
            // otherwise the trailing period is kept and the loop ends
        }
    }
}


// Splits 'val' into candidate country tokens.
//
// The value is first split on ',', ':', '(' and ')'.  A token is then split
// again at its first '.' when that period is past index 3, at least four
// characters follow it, and the word immediately before it is longer than
// four characters.  The text after the period is inserted in front of the
// text before it, and is itself examined again for further periods.
vector<string> CCountries::x_Tokenize(const string& val)
{
    vector<string> tokens;
    NStr::Split(val, ",:()", tokens);

    size_t idx = 0;
    while (idx < tokens.size()) {
        const string current = tokens[idx];
        const size_t dot = NStr::Find(current, ".");

        bool split_here = false;
        if (dot != NPOS  &&  dot > 3  &&  current.length() - dot > 4) {
            // length of the last space-separated word in front of the period
            const string head = current.substr(0, dot);
            const size_t last_space = head.rfind(' ');
            const size_t last_word_len = (last_space == NPOS)
                ? head.length()
                : head.length() - last_space - 1;
            split_here = last_word_len > 4;
        }

        if (split_here) {
            const string tail = current.substr(dot + 1);
            tokens[idx] = current.substr(0, dot);
            // idx is left unchanged so that it now refers to 'tail'
            tokens.insert(tokens.begin() + idx, tail);
        } else {
            ++idx;
        }
    }
    return tokens;
}


bool s_ContainsWholeWord(const CTempString test, const CTempString word, NStr::ECase case_sense)
{
    size_t start = 0;
    size_t tlen = test.length();
    size_t wlen = word.length();

    size_t pos = NStr::Find(test, word, case_sense);
    while (pos != NPOS) {
        size_t p = start + pos;
        if ( (p == 0           || !isalpha((unsigned char)test[p - 1]))  &&
             (p + wlen >= tlen || !isalpha((unsigned char)test[p + wlen])) ) {
            return true;
        }
        start = p + 1;
        pos = NStr::Find(CTempString(test, start, tlen - start), word, case_sense);
    }
    return false;
}


// Returns true when 'test' contains "Sea" or "USSR" as a whole word
// (case-insensitive).
bool s_SuppressCountryFix(const string& test)
{
    return s_ContainsWholeWord(test, "Sea", NStr::eNocase)
        || s_ContainsWholeWord(test, "USSR", NStr::eNocase);
}


void CCountries::x_FindCountryName
(const TCStringPairsMap& fix_map,
 const vector<string>& countries,
 string& valid_country,
 string& orig_valid_country,
 bool& too_many_countries,
 bool& bad_cap)
{
    for (auto country : countries) {
        if (!country.empty() && !too_many_countries)
        {
            string check = country;
            NStr::TruncateSpacesInPlace(check);
            x_RemoveDelimitersFromEnds(check);

            bool check_has_bad_cap = false;
            if (IsValid(check,check_has_bad_cap))
            {
                if (valid_country.empty())
                {
                    valid_country = check;
                    orig_valid_country = check;
                    bad_cap = check_has_bad_cap;
                }
                else
                {
                    too_many_countries = true;
                }
            }
            else // see if this is a fixable country
            {
                TCStringPairsMap::const_iterator found = fix_map.find(check.c_str());
                if (found != fix_map.end())
                {
                    if (valid_country.empty())
                    {
                        valid_country = found->second;
                        orig_valid_country = check;
                    }
                    else
                    {
                        too_many_countries = true;
                    }
                }
            }
        }
    }
}

// start of RW-1278

// Collapses every run of consecutive space characters (' ') in 'val' into a
// single space, in place.  Other whitespace characters are left untouched.
//
// @param val  string to compress; modified only when a run was found
// @return true when 'val' was changed, false otherwise (including for an
//         empty string).
//
// Implemented on std::string so there is no manually managed buffer to leak
// if an allocation throws, and so that a string containing an embedded NUL
// is processed in full rather than being cut off at the NUL.
bool s_CompressRunsOfSpaces(std::string& val)
{
    std::string compressed;
    compressed.reserve(val.size());

    bool prev_was_space = false;
    for (const char ch : val) {
        const bool is_space = (ch == ' ');
        if (!(is_space && prev_was_space)) {
            compressed.push_back(ch);
        }
        prev_was_space = is_space;
    }

    // Characters are only ever dropped, so equal sizes mean nothing changed.
    if (compressed.size() == val.size()) {
        return false;
    }
    val.swap(compressed);
    return true;
}

// Lookup table behind s_IsParish: each key is an accepted spelling of a
// parish name - either the normal spaced form or the run-together form
// (e.g. "EastBatonRougeParish") - and each value is the spaced name that
// s_IsParish substitutes for it.
// NOTE(review): CStaticPairArrayMap presumably requires the rows to stay
// sorted by key under the map's comparator (PNocase_CStr, i.e. ignoring
// case) - confirm before inserting or reordering rows.
typedef SStaticPair<const char*, const char*> TParishMapEntry;
static const TParishMapEntry parish_abbrev_array[] = {
    { "Acadia Parish",               "Acadia Parish"               },
    { "AcadiaParish",                "Acadia Parish"               },
    { "Allen Parish",                "Allen Parish"                },
    { "AllenParish",                 "Allen Parish"                },
    { "Ascension Parish",            "Ascension Parish"            },
    { "AscensionParish",             "Ascension Parish"            },
    { "Assumption Parish",           "Assumption Parish"           },
    { "AssumptionParish",            "Assumption Parish"           },
    { "Avoyelles Parish",            "Avoyelles Parish"            },
    { "AvoyellesParish",             "Avoyelles Parish"            },
    { "Beauregard Parish",           "Beauregard Parish"           },
    { "BeauregardParish",            "Beauregard Parish"           },
    { "Bienville Parish",            "Bienville Parish"            },
    { "BienvilleParish",             "Bienville Parish"            },
    { "Bossier Parish",              "Bossier Parish"              },
    { "BossierParish",               "Bossier Parish"              },
    { "Caddo Parish",                "Caddo Parish"                },
    { "CaddoParish",                 "Caddo Parish"                },
    { "Calcasieu Parish",            "Calcasieu Parish"            },
    { "CalcasieuParish",             "Calcasieu Parish"            },
    { "Caldwell Parish",             "Caldwell Parish"             },
    { "CaldwellParish",              "Caldwell Parish"             },
    { "Cameron Parish",              "Cameron Parish"              },
    { "CameronParish",               "Cameron Parish"              },
    { "Catahoula Parish",            "Catahoula Parish"            },
    { "CatahoulaParish",             "Catahoula Parish"            },
    { "Claiborne Parish",            "Claiborne Parish"            },
    { "ClaiborneParish",             "Claiborne Parish"            },
    { "Concordia Parish",            "Concordia Parish"            },
    { "ConcordiaParish",             "Concordia Parish"            },
    { "DeSoto Parish",               "DeSoto Parish"               },
    { "DeSotoParish",                "DeSoto Parish"               },
    { "East Baton Rouge Parish",     "East Baton Rouge Parish"     },
    { "East Carroll Parish",         "East Carroll Parish"         },
    { "East Feliciana Parish",       "East Feliciana Parish"       },
    { "EastBatonRougeParish",        "East Baton Rouge Parish"     },
    { "EastCarrollParish",           "East Carroll Parish"         },
    { "EastFelicianaParish",         "East Feliciana Parish"       },
    { "Evangeline Parish",           "Evangeline Parish"           },
    { "EvangelineParish",            "Evangeline Parish"           },
    { "Franklin Parish",             "Franklin Parish"             },
    { "FranklinParish",              "Franklin Parish"             },
    { "Grant Parish",                "Grant Parish"                },
    { "GrantParish",                 "Grant Parish"                },
    { "Iberia Parish",               "Iberia Parish"               },
    { "IberiaParish",                "Iberia Parish"               },
    { "Iberville Parish",            "Iberville Parish"            },
    { "IbervilleParish",             "Iberville Parish"            },
    { "Jackson Parish",              "Jackson Parish"              },
    { "JacksonParish",               "Jackson Parish"              },
    { "Jefferson Davis Parish",      "Jefferson Davis Parish"      },
    { "Jefferson Parish",            "Jefferson Parish"            },
    { "JeffersonDavisParish",        "Jefferson Davis Parish"      },
    { "JeffersonParish",             "Jefferson Parish"            },
    { "Lafayette Parish",            "Lafayette Parish"            },
    { "LafayetteParish",             "Lafayette Parish"            },
    { "Lafourche Parish",            "Lafourche Parish"            },
    { "LafourcheParish",             "Lafourche Parish"            },
    { "LaSalle Parish",              "LaSalle Parish"              },
    { "LaSalleParish",               "LaSalle Parish"              },
    { "Lincoln Parish",              "Lincoln Parish"              },
    { "LincolnParish",               "Lincoln Parish"              },
    { "Livingston Parish",           "Livingston Parish"           },
    { "LivingstonParish",            "Livingston Parish"           },
    { "Madison Parish",              "Madison Parish"              },
    { "MadisonParish",               "Madison Parish"              },
    { "Morehouse Parish",            "Morehouse Parish"            },
    { "MorehouseParish",             "Morehouse Parish"            },
    { "Natchitoches Parish",         "Natchitoches Parish"         },
    { "NatchitochesParish",          "Natchitoches Parish"         },
    { "Orleans Parish",              "Orleans Parish"              },
    { "OrleansParish",               "Orleans Parish"              },
    { "Ouachita Parish",             "Ouachita Parish"             },
    { "OuachitaParish",              "Ouachita Parish"             },
    { "Plaquemines Parish",          "Plaquemines Parish"          },
    { "PlaqueminesParish",           "Plaquemines Parish"          },
    { "Pointe Coupee Parish",        "Pointe Coupee Parish"        },
    { "PointeCoupeeParish",          "Pointe Coupee Parish"        },
    { "Rapides Parish",              "Rapides Parish"              },
    { "RapidesParish",               "Rapides Parish"              },
    { "Red River Parish",            "Red River Parish"            },
    { "RedRiverParish",              "Red River Parish"            },
    { "Richland Parish",             "Richland Parish"             },
    { "RichlandParish",              "Richland Parish"             },
    { "Sabine Parish",               "Sabine Parish"               },
    { "SabineParish",                "Sabine Parish"               },
    { "St. Bernard Parish",          "St. Bernard Parish"          },
    { "St. Charles Parish",          "St. Charles Parish"          },
    { "St. Helena Parish",           "St. Helena Parish"           },
    { "St. James Parish",            "St. James Parish"            },
    { "St. John the Baptist Parish", "St. John the Baptist Parish" },
    { "St. Landry Parish",           "St. Landry Parish"           },
    { "St. Martin Parish",           "St. Martin Parish"           },
    { "St. Mary Parish",             "St. Mary Parish"             },
    { "St. Tammany Parish",          "St. Tammany Parish"          },
    { "St.BernardParish",            "St. Bernard Parish"          },
    { "St.CharlesParish",            "St. Charles Parish"          },
    { "St.HelenaParish",             "St. Helena Parish"           },
    { "St.JamesParish",              "St. James Parish"            },
    { "St.JohntheBaptistParish",     "St. John the Baptist Parish" },
    { "St.LandryParish",             "St. Landry Parish"           },
    { "St.MartinParish",             "St. Martin Parish"           },
    { "St.MaryParish",               "St. Mary Parish"             },
    { "St.TammanyParish",            "St. Tammany Parish"          },
    { "Tangipahoa Parish",           "Tangipahoa Parish"           },
    { "TangipahoaParish",            "Tangipahoa Parish"           },
    { "Tensas Parish",               "Tensas Parish"               },
    { "TensasParish",                "Tensas Parish"               },
    { "Terrebonne Parish",           "Terrebonne Parish"           },
    { "TerrebonneParish",            "Terrebonne Parish"           },
    { "Union Parish",                "Union Parish"                },
    { "UnionParish",                 "Union Parish"                },
    { "Vermilion Parish",            "Vermilion Parish"            },
    { "VermilionParish",             "Vermilion Parish"            },
    { "Vernon Parish",               "Vernon Parish"               },
    { "VernonParish",                "Vernon Parish"               },
    { "Washington Parish",           "Washington Parish"           },
    { "WashingtonParish",            "Washington Parish"           },
    { "Webster Parish",              "Webster Parish"              },
    { "WebsterParish",               "Webster Parish"              },
    { "West Baton Rouge Parish",     "West Baton Rouge Parish"     },
    { "West Carroll Parish",         "West Carroll Parish"         },
    { "West Feliciana Parish",       "West Feliciana Parish"       },
    { "WestBatonRougeParish",        "West Baton Rouge Parish"     },
    { "WestCarrollParish",           "West Carroll Parish"         },
    { "WestFelicianaParish",         "West Feliciana Parish"       },
    { "Winn Parish",                 "Winn Parish"                 },
    { "WinnParish",                  "Winn Parish"                 }
};

// Static map over parish_abbrev_array; keys are compared with PNocase_CStr,
// so lookups ignore letter case.
typedef CStaticPairArrayMap<const char *, const char *, PNocase_CStr> TParishMap;
DEFINE_STATIC_ARRAY_MAP(TParishMap, parishAbbrevMap, parish_abbrev_array);

/// Look `parish` up in parishAbbrevMap and, on a hit, overwrite it with the
/// mapped (spaced) parish name.
///
/// @param parish  candidate name; rewritten only when it is found in the map
/// @return        true when the name was found, false for "" or a miss
bool s_IsParish ( string& parish ) {

    if ( parish.empty() ) {
        return false;
    }

    const TParishMap::const_iterator hit = parishAbbrevMap.find(parish.c_str());
    const bool known = ( hit != parishAbbrevMap.end() );
    if ( known ) {
        // swap in the spelling stored as the map value
        parish = hit->second;
    }

    return known;
}

// Lookup table behind s_IsState: each key is either a two-letter
// abbreviation or a spelled-out name, and each value is the full name that
// s_IsState substitutes for it ("Virgin Islands" and "VI" both map to
// "US Virgin Islands").
// NOTE(review): CStaticPairArrayMap presumably requires the rows to stay
// sorted by key under the map's comparator (PNocase_CStr, i.e. ignoring
// case) - confirm before inserting or reordering rows.
typedef SStaticPair<const char*, const char*> TStateMapEntry;
static const TStateMapEntry state_abbrev_array[] = {
    { "AK",                    "Alaska"               },
    { "AL",                    "Alabama"              },
    { "Alabama",               "Alabama"              },
    { "Alaska",                "Alaska"               },
    { "American Samoa",        "American Samoa"       },
    { "AR",                    "Arkansas"             },
    { "Arizona",               "Arizona"              },
    { "Arkansas",              "Arkansas"             },
    { "AS",                    "American Samoa"       },
    { "AZ",                    "Arizona"              },
    { "CA",                    "California"           },
    { "California",            "California"           },
    { "CO",                    "Colorado"             },
    { "Colorado",              "Colorado"             },
    { "Connecticut",           "Connecticut"          },
    { "CT",                    "Connecticut"          },
    { "DC",                    "District of Columbia" },
    { "DE",                    "Delaware"             },
    { "Delaware",              "Delaware"             },
    { "District of Columbia",  "District of Columbia" },
    { "FL",                    "Florida"              },
    { "Florida",               "Florida"              },
    { "GA",                    "Georgia"              },
    { "Georgia",               "Georgia"              },
    { "GU",                    "Guam"                 },
    { "Guam",                  "Guam"                 },
    { "Hawaii",                "Hawaii"               },
    { "HI",                    "Hawaii"               },
    { "IA",                    "Iowa"                 },
    { "ID",                    "Idaho"                },
    { "Idaho",                 "Idaho"                },
    { "IL",                    "Illinois"             },
    { "Illinois",              "Illinois"             },
    { "IN",                    "Indiana"              },
    { "Indiana",               "Indiana"              },
    { "Iowa",                  "Iowa"                 },
    { "Kansas",                "Kansas"               },
    { "Kentucky",              "Kentucky"             },
    { "KS",                    "Kansas"               },
    { "KY",                    "Kentucky"             },
    { "LA",                    "Louisiana"            },
    { "Louisiana",             "Louisiana"            },
    { "MA",                    "Massachusetts"        },
    { "Maine",                 "Maine"                },
    { "Maryland",              "Maryland"             },
    { "Massachusetts",         "Massachusetts"        },
    { "MD",                    "Maryland"             },
    { "ME",                    "Maine"                },
    { "MI",                    "Michigan"             },
    { "Michigan",              "Michigan"             },
    { "Minnesota",             "Minnesota"            },
    { "Mississippi",           "Mississippi"          },
    { "Missouri",              "Missouri"             },
    { "MN",                    "Minnesota"            },
    { "MO",                    "Missouri"             },
    { "Montana",               "Montana"              },
    { "MS",                    "Mississippi"          },
    { "MT",                    "Montana"              },
    { "NC",                    "North Carolina"       },
    { "ND",                    "North Dakota"         },
    { "NE",                    "Nebraska"             },
    { "Nebraska",              "Nebraska"             },
    { "Nevada",                "Nevada"               },
    { "New Hampshire",         "New Hampshire"        },
    { "New Jersey",            "New Jersey"           },
    { "New Mexico",            "New Mexico"           },
    { "New York",              "New York"             },
    { "NH",                    "New Hampshire"        },
    { "NJ",                    "New Jersey"           },
    { "NM",                    "New Mexico"           },
    { "North Carolina",        "North Carolina"       },
    { "North Dakota",          "North Dakota"         },
    { "NV",                    "Nevada"               },
    { "NY",                    "New York"             },
    { "OH",                    "Ohio"                 },
    { "Ohio",                  "Ohio"                 },
    { "OK",                    "Oklahoma"             },
    { "Oklahoma",              "Oklahoma"             },
    { "OR",                    "Oregon"               },
    { "Oregon",                "Oregon"               },
    { "PA",                    "Pennsylvania"         },
    { "Pennsylvania",          "Pennsylvania"         },
    { "PR",                    "Puerto Rico"          },
    { "Puerto Rico",           "Puerto Rico"          },
    { "Rhode Island",          "Rhode Island"         },
    { "RI",                    "Rhode Island"         },
    { "SC",                    "South Carolina"       },
    { "SD",                    "South Dakota"         },
    { "South Carolina",        "South Carolina"       },
    { "South Dakota",          "South Dakota"         },
    { "Tennessee",             "Tennessee"            },
    { "Texas",                 "Texas"                },
    { "TN",                    "Tennessee"            },
    { "TX",                    "Texas"                },
    { "US Virgin Islands",     "US Virgin Islands"    },
    { "UT",                    "Utah"                 },
    { "Utah",                  "Utah"                 },
    { "VA",                    "Virginia"             },
    { "Vermont",               "Vermont"              },
    { "VI",                    "US Virgin Islands"    },
    { "Virgin Islands",        "US Virgin Islands"    },
    { "Virginia",              "Virginia"             },
    { "VT",                    "Vermont"              },
    { "WA",                    "Washington"           },
    { "Washington",            "Washington"           },
    { "West Virginia",         "West Virginia"        },
    { "WI",                    "Wisconsin"            },
    { "Wisconsin",             "Wisconsin"            },
    { "WV",                    "West Virginia"        },
    { "WY",                    "Wyoming"              },
    { "Wyoming",               "Wyoming"              }
};

// Static map over state_abbrev_array; keys are compared with PNocase_CStr,
// so lookups ignore letter case.
typedef CStaticPairArrayMap<const char *, const char *, PNocase_CStr> TStateMap;
DEFINE_STATIC_ARRAY_MAP(TStateMap, stateAbbrevMap, state_abbrev_array);

/// Decide whether `state` names an entry of stateAbbrevMap and, if so,
/// replace it with the full name stored there.
///
/// Before the lookup a leading "State of " and/or "Commonwealth of " and a
/// trailing " State" are removed (ignoring case) and flanking spaces are
/// trimmed; the stripped text is only used as the lookup key.
///
/// @param state     candidate text; overwritten with the mapped name on a hit
/// @param modified  set to true when the mapped name differs from the text
///                  passed in; never reset to false here
/// @return          true on a hit, false for "" or an unknown name
bool s_IsState ( string& state, bool& modified ) {

    if ( state.empty() ) {
        return false;
    }

    string candidate = state;

    // leading phrases are removed in this order, each at most once
    const char* const leading_phrases[] = { "State of ", "Commonwealth of " };
    for ( const char* phrase : leading_phrases ) {
        if ( NStr::StartsWith ( candidate, phrase, NStr::eNocase )) {
            NStr::TrimPrefixInPlace ( candidate, phrase, NStr::eNocase );
        }
    }

    if ( NStr::EndsWith ( candidate, " State", NStr::eNocase )) {
        NStr::TrimSuffixInPlace ( candidate, " State", NStr::eNocase );
    }

    NStr::TruncateSpacesInPlace ( candidate );

    const TStateMap::const_iterator hit = stateAbbrevMap.find(candidate.c_str());
    if ( hit == stateAbbrevMap.end() ) {
        return false;
    }

    // report conversion from two-letter, changed capitalization, or prefix/suffix removal
    const string full_name = hit->second;
    if ( ! NStr::Equal ( state, full_name )) {
        modified = true;
    }
    state = full_name;
    return true;
}

/// Rewrite a "USA: ..." country string so that the first recognized state
/// comes directly after "USA: ", followed by the remaining comma-separated
/// components in their original order.
///
/// Steps, in order:
///  - strip one pair of flanking double quotes and flanking spaces;
///  - split at the colon; if the left part is not "USA"/"US" (any case), run
///    the text through CCountries::NewFixCountry(..., true) and split again;
///  - split the right part at commas, trim each component, compress runs of
///    spaces, and expand run-together parish names via s_IsParish;
///  - pass every component to s_IsState, remembering the first hit and the
///    number of hits.
///
/// @param country  text to clean up; overwritten with the rebuilt string
///                 unless e_NoResult or e_NotUSA is returned
/// @return  e_NoResult  - input was empty
///          e_NotUSA    - left part is not USA even after NewFixCountry
///          e_Valid     - nothing after the colon (country set to "USA"), or
///                        exactly one state and the text is unchanged
///          e_Missing   - no component is a known state
///          e_Ambiguous - more than one component is a known state
///          e_Corrected - exactly one state and the text was changed
CCountries::EStateCleanup s_DoUSAStateCleanup ( string& country ) {

    if ( country.empty() ) {
        return CCountries::e_NoResult;
    }

    // keep the caller's text for the final "was anything changed" comparison
    const string original = country;
    string working = country;

    // remove flanking quotation marks - if CCountries::NewFixCountry not called
    if ( NStr::StartsWith ( working, "\"" ) && NStr::EndsWith ( working, "\"" )) {
        working = working.substr ( 1, working.length() - 2 );
    }

    // remove flanking spaces
    NStr::TruncateSpacesInPlace ( working );

    // separate strings before and after colon
    string frst, scnd;
    auto split_at_colon = [&working, &frst, &scnd]() {
        NStr::SplitInTwo ( working, ":", frst, scnd );
        NStr::TruncateSpacesInPlace ( frst );
        NStr::TruncateSpacesInPlace ( scnd );
    };
    auto is_usa = [&frst]() {
        return NStr::EqualNocase ( frst, "USA") || NStr::EqualNocase ( frst, "US");
    };

    split_at_colon();

    // confirm that country is USA
    if ( ! is_usa() ) {
        // if not, first try rescuing US territory
        working = CCountries::NewFixCountry(working, true);
        split_at_colon();
        if ( ! is_usa() ) {
            return CCountries::e_NotUSA;
        }
    }

    // split state/county/city clauses at commas
    vector<string> components;
    NStr::Split(scnd, ",", components);

    // check for only country
    if ( components.empty() ) {
        country = "USA";
        return CCountries::e_Valid;
    }

    for ( string& part : components ) {
        // remove flanking spaces around components
        NStr::TruncateSpacesInPlace ( part );
        s_CompressRunsOfSpaces ( part );
        // clean up runon strings like EastBatonRougeParish
        if ( NStr::EndsWith ( part, "Parish", NStr::eNocase )) {
            s_IsParish( part );
        }
    }

    // index of the first component accepted by s_IsState (NPOS = none yet)
    size_t match = NPOS;
    size_t num_states = 0;

    for ( size_t j = 0; j < components.size(); ++j ) {
        // s_IsState needs the out-parameter, but its value is not used here
        bool modified = false;
        if ( ! s_IsState ( components[j], modified )) {
            continue;
        }
        if ( match == NPOS ) {
            match = j;
        }
        ++num_states;
    }

    // generate result
    string res ("USA: ");
    const char* pfx = "";

    if ( match != NPOS ) {
        // move first state matched to first position
        res.append ( components[match] );
        pfx = ", ";
    }

    for ( size_t j = 0; j < components.size(); ++j ) {
        if ( j == match ) continue;
        res.append ( pfx );
        res.append ( components[j] );
        pfx = ", ";
    }

    country = res;

    if ( match == NPOS ) {
        return CCountries::e_Missing;
    }
    if ( num_states > 1 ) {
        return CCountries::e_Ambiguous;
    }
    if ( ! NStr::Equal ( original, res )) {
        return CCountries::e_Corrected;
    }

    return CCountries::e_Valid;
}

// Row reader type used by CCountries::ReadUSAExceptionMap to iterate over
// the rows of the exception file.
typedef CRowReader<CRowReaderStream_NCBI_TSV> TNCBITSVStream;

// File-scope exception table: filled by CCountries::LoadUSAExceptionMap and
// consulted by CCountries::USAStateCleanup.
static CCountries::TUsaExceptionMap exception_map;
// Set to true by LoadUSAExceptionMap; USAStateCleanup ignores exception_map
// until then.
// NOTE(review): no lock guards these two variables in this part of the
// file - confirm the map is loaded before any concurrent cleanup calls.
static bool exceptions_initialized = false;

/// Read "from <tab> to" rows from `exception_file` into `exceptions`.
///
/// Rows that do not have exactly two fields are skipped. Existing entries in
/// `exceptions` are kept; a row whose first field is already present
/// overwrites that entry. An empty file name is a no-op.
///
/// @param exceptions      map that receives the rows (first field -> second)
/// @param exception_file  file handed to the TSV row reader
void CCountries::ReadUSAExceptionMap (CCountries::TUsaExceptionMap& exceptions, const string& exception_file ) {

    // nothing to read when no file name was supplied
    if ( exception_file.empty()) {
        return;
    }

    TNCBITSVStream tsv_reader (exception_file);
    for ( const auto & record : tsv_reader ) {
        const TFieldNo field_count = record. GetNumberOfFields();
        if ( field_count != 2 ) {
            continue;
        }
        const string key = record[0].Get<string>();
        const string replacement = record[1].Get<string>();
        exceptions [key] = replacement;
    }
}

void CCountries::LoadUSAExceptionMap (const TUsaExceptionMap& exceptions) {

    // clear previous map
    exception_map.clear();

    // initialize internal exception map
    for ( const auto & itm : exceptions ) {
        string fr = itm.first;
        string to = itm.second;

        // ensure colon is followed by space to match initial correction
        string f1, f2;
        NStr::SplitInTwo ( fr, ":", f1, f2 );
        NStr::TruncateSpacesInPlace ( f1 );
        NStr::TruncateSpacesInPlace ( f2 );
        if ( ! f1.empty() && ! f2.empty()) {
            fr = f1 + ": " + f2;
        }

        exception_map [fr] = to;
    }

    exceptions_initialized = true;
}

void CCountries::LoadUSAExceptionMap (const string& exception_file ) {

    if ( ! exception_file.empty()) {

        TUsaExceptionMap exceptions;
        ReadUSAExceptionMap ( exceptions, exception_file );
        LoadUSAExceptionMap ( exceptions );
    }
}

/// Clean up a USA country string and report what kind of change was made.
///
/// The text is first normalized by s_DoUSAStateCleanup. If an exception
/// table has been loaded and contains a non-empty entry for the normalized
/// text, that entry is returned instead and `type` is derived from it.
///
/// @param country  text to clean up
/// @param type     receives the classification of the result
/// @return         the cleaned-up (or exception-mapped) text
string CCountries::USAStateCleanup ( const string& country, CCountries::EStateCleanup& type ) {

    // call algorithmic mapping function
    string working = country;
    type = s_DoUSAStateCleanup ( working );

    // apply exceptions from preloaded data file
    if ( exceptions_initialized ) {
        // find() instead of operator[]: a miss must not insert an empty
        // entry into the shared table on every lookup
        const auto hit = exception_map.find ( working );
        if ( hit != exception_map.end() && ! hit->second.empty()) {
            const string& corrected = hit->second;
            // presence in map here will disambiguate otherwise ambiguous name pair,
            // thus self-entries need to be added to the ambiguous state exception list
            if ( ! NStr::StartsWith ( corrected, "USA" )) {
                type = e_NotUSA;
            } else if ( NStr::Equal ( corrected, working ) && NStr::Equal ( corrected, country )) {
                type = e_Valid;
            } else {
                type = e_Corrected;
            }
            return corrected;
        }
    }

    // anything that does not start with "USA" at this point is reported as
    // not-USA, whatever the algorithmic step returned
    if ( ! NStr::StartsWith ( working, "USA" )) {
        type = e_NotUSA;
    }
    return working;
}

/// Overload for callers that only want the cleaned-up text; the
/// classification produced by the two-argument version is discarded.
string CCountries::USAStateCleanup ( const string& country ) {

    EStateCleanup ignored_type = e_NoResult;
    return USAStateCleanup ( country, ignored_type );
}

// end of RW-1278

// Try to turn `test` into an acceptable country string.
//
// The steps run in this order and the first one that produces a result wins:
//   1. inputs for which s_SuppressCountryFix() is true are returned as-is
//      when IsValid(), otherwise the result is the empty string;
//   2. one pair of flanking double quotes, flanking spaces and one trailing
//      colon are removed;
//   3. "U.S.A.", "United States" or "United States of America" before the
//      colon is rewritten to "USA";
//   4. a hit in k_old_country_name_fixes is returned immediately;
//   5. when us_territories is set, inputs starting with Puerto Rico, Guam,
//      American Samoa or Virgin Islands are prefixed with "USA: " and sent
//      through USAStateCleanup;
//   6. an input that IsValid() is returned after extra colons are turned
//      into commas;
//   7. a non-empty WholeCountryFix() result is returned;
//   8. otherwise the input is tokenized and searched for a single known
//      country name, which is moved to the front of the result.
//
// Returns the fixed string; the result is empty when none of the steps
// produced one.
string CCountries::NewFixCountry (const string& test, bool us_territories)
{
    // change requested for JIRA:SQD-1410
    if (s_SuppressCountryFix(test)) {
        if (IsValid(test)) {
            return test;
        } else {
            return kEmptyStr;
        }
    }

    // strip one pair of flanking double quotes, then flanking spaces
    string input = test;
    if (NStr::StartsWith(input, "\"") && NStr::EndsWith(input, "\"")) {
        input = input.substr(1, input.length() - 2);
    }
    NStr::TruncateSpacesInPlace(input);

    // drop a single trailing colon
    if (NStr::EndsWith(input, ":")) {
        input = input.substr(0, input.length() - 1);
        NStr::TruncateSpacesInPlace(input);
    }

    // normalize long spellings of the United States to "USA" when something
    // follows the colon
    string usa1,usa2;
    NStr::SplitInTwo(input, ":", usa1, usa2);
    if (!usa1.empty() && !usa2.empty()) {
        NStr::TruncateSpacesInPlace(usa1);
        NStr::TruncateSpacesInPlace(usa2);
        if (NStr::EqualNocase(usa1, "U.S.A.") || NStr::EqualNocase(usa1, "United States") || NStr::EqualNocase(usa1, "United States of America")) {
            input = "USA: " + usa2;
        }
    }

    // a whole-string match in the old-name table short-circuits everything else
    auto old_name_fix = k_old_country_name_fixes.find(input.c_str());
    if (old_name_fix != k_old_country_name_fixes.end()) {
        input = old_name_fix->second;
        return input;
    }

    if (us_territories) {
        // territory names are re-expressed as "USA: <territory>..." and then
        // normalized by the state cleanup
        if ( NStr::StartsWith( input, "Puerto Rico", NStr::eNocase) || NStr::StartsWith( input, "Guam", NStr::eNocase) || NStr::StartsWith( input, "American Samoa", NStr::eNocase) ) {
            input = "USA: " + input;
            CCountries::ChangeExtraColonsToCommas(input);
            input = CCountries::USAStateCleanup(input);
            return input;
        } else if ( NStr::StartsWith( input, "Virgin Islands", NStr::eNocase) ) {
            input = "USA: US " + input;
            CCountries::ChangeExtraColonsToCommas(input);
            input = CCountries::USAStateCleanup(input);
            return input;
        }
    }

    if (IsValid(input)) {
        CCountries::ChangeExtraColonsToCommas(input);
        return input;
    }
    string new_country = WholeCountryFix(input);
    if (!new_country.empty())
        return new_country;

    // from here on new_country is empty; it is only filled by one of the two
    // branches below
    bool too_many_countries = false;
    bool bad_cap = false;
    vector<string> countries = x_Tokenize(input);
    string valid_country;
    string orig_valid_country;

    // look for a country name first, and for a subregion only if none was found
    x_FindCountryName(k_country_name_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
    if (valid_country.empty()) {
        x_FindCountryName(k_subregion_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
    }

    if (!valid_country.empty() && !too_many_countries)
        too_many_countries = ContainsMultipleCountryNames (input);

    if (!valid_country.empty() && too_many_countries && valid_country == input)
    {
        // the whole input is the matched name: only make sure the colon is
        // followed by a space; otherwise new_country stays empty
        string str1,str2;
        NStr::SplitInTwo(valid_country,":",str1,str2);
        if (!str1.empty() && !str2.empty() && !NStr::StartsWith(str2," "))
            new_country = str1+": "+str2;

        CCountries::ChangeExtraColonsToCommas(new_country);
    }
    else if(!valid_country.empty() && !too_many_countries)
    {
        // find valid_country in input
        // NOTE(review): the result is not checked against NPOS before it is
        // used in the substr() calls below - confirm that orig_valid_country
        // is always a substring of input
        size_t pos = NStr::Find(input,orig_valid_country);
        // save preceding string without trailing spaces or delimiters ":,"
        string before = input.substr(0,pos);

        x_RemoveDelimitersFromEnds(before);
        NStr::TruncateSpacesInPlace(before);
        // save trailing string without initial spaces or delimiters
        string after = input.substr(pos+orig_valid_country.length());
        x_RemoveDelimitersFromEnds(after, true);
        NStr::TruncateSpacesInPlace(after);
        if (bad_cap) new_country = GetCorrectedCountryCapitalization(valid_country);
        else new_country = valid_country;
        // separator after the country: ": " when the name has no colon yet,
        // ", " when it already carries one
        if (!before.empty() || !after.empty()) {
            if (NStr::Find(valid_country, ":") == NPOS) {
                new_country += ": ";
            } else {
                new_country += ", ";
            }
        }
        if (!before.empty())
            new_country += before;
        // no comma is inserted in front of a lone closing parenthesis
        if (!before.empty() && !after.empty() && !NStr::Equal(after, ")"))
            new_country += ", ";
        if (!after.empty())
            new_country += after;
        CCountries::ChangeExtraColonsToCommas(new_country);
    }

    return new_country;
}


/// Keep the first ':' in `country` and turn every later ':' into ','.
///
/// @param country  text to adjust in place
/// @return         true when at least one colon was replaced
bool CCountries::ChangeExtraColonsToCommas(string& country)
{
    // requested in SQD-4516
    SIZE_TYPE colon = country.find(':');
    if (colon == NPOS) {
        return false;
    }

    bool replaced_any = false;
    // every colon after the first one becomes a comma
    while ((colon = country.find(':', colon + 1)) != NPOS) {
        country[colon] = ',';
        replaced_any = true;
    }
    return replaced_any;
}


/// Run `input` through NewFixCountry and tidy the text after the first colon.
///
/// When the fixed string has a colon, any ',', ':' and whitespace directly
/// following it are skipped; if nothing remains the colon is dropped,
/// otherwise the result is "<country>: <rest>", with <rest> optionally
/// passed through CapitalizeFirstLetterOfEveryWord.
///
/// @param input                   raw country text
/// @param capitalize_after_colon  capitalize the words after the colon
/// @return                        the fixed-up text
string CCountries::CountryFixupItem(const string &input, bool capitalize_after_colon)
{
    const string fixed = NewFixCountry (input);
    const SIZE_TYPE colon_pos = NStr::Find(fixed,":");
    if (colon_pos == NPOS) {
        return fixed;
    }

    // skip the colon itself and any delimiters or whitespace behind it
    SIZE_TYPE tail_start = colon_pos;
    for (;;) {
        const char c = fixed[tail_start];
        if (c != ','  &&  c != ':'  &&  !isspace((unsigned char)c)) {
            break;
        }
        ++tail_start;
    }

    string remainder = fixed.substr(tail_start);
    if (remainder.empty()) {
        // nothing useful after the colon: return the bare country name
        return (tail_start > colon_pos) ? fixed.substr(0, colon_pos) : fixed;
    }

    NStr::TruncateSpacesInPlace(remainder,NStr::eTrunc_Begin);
    if (capitalize_after_colon) {
        remainder = CapitalizeFirstLetterOfEveryWord (remainder);
    }
    string result = fixed.substr(0,colon_pos);
    result += ": " + remainder;
    return result;
}


// SubSource Qual Fixups
// Pair and map types shared by the static capitalization tables below; the
// map compares keys with PNocase_CStr, i.e. ignoring letter case.
typedef SStaticPair<const char*, const char*> TStaticQualFixPair;
typedef CStaticPairArrayMap<const char*, const char*, PNocase_CStr> TStaticQualFixMap;

// Development-stage values. Key and value are the same lower-case word, so a
// lookup only normalizes capitalization.
static const TStaticQualFixPair kDevStagePairs[] = {
    { "adult", "adult" },
    { "egg", "egg" },
    { "juvenile", "juvenile" },
    { "larva", "larva" }
};

DEFINE_STATIC_ARRAY_MAP(TStaticQualFixMap, sc_DevStagePairs, kDevStagePairs);


/// Return the spelling stored in sc_DevStagePairs for `value`, or `value`
/// itself when it is not in that table.
string CSubSource::FixDevStageCapitalization(const string& value)
{
    const TStaticQualFixMap::const_iterator hit = sc_DevStagePairs.find(value.c_str());
    if (hit == sc_DevStagePairs.end()) {
        // not a listed development stage: leave untouched
        return value;
    }
    return hit->second;
}


// Cell-type values. Key and value are the same lower-case word, so a lookup
// through the case-insensitive TStaticQualFixMap only normalizes
// capitalization.
static const TStaticQualFixPair kCellTypePairs[] = {
    { "hemocyte", "hemocyte" },
    { "hepatocyte", "hepatocyte" },
    { "lymphocyte", "lymphocyte" },
    { "neuroblast", "neuroblast" }
};

DEFINE_STATIC_ARRAY_MAP(TStaticQualFixMap, sc_CellTypePairs, kCellTypePairs);

/// Return the spelling stored in sc_CellTypePairs for `value`, or `value`
/// itself when it is not in that table.
string CSubSource::FixCellTypeCapitalization(const string& value)
{
    const TStaticQualFixMap::const_iterator hit = sc_CellTypePairs.find(value.c_str());
    if (hit == sc_CellTypePairs.end()) {
        // not a listed cell type: leave untouched
        return value;
    }
    return hit->second;
}

// Mutex taken by s_InitializeQualMaps while it fills the map below.
DEFINE_STATIC_FAST_MUTEX(s_QualFixMutex);
// String-to-string map whose keys are compared with PNocase.
typedef map<string, string, PNocase> TQualFixMap;

// Replacement table for isolation-source values; filled once by
// s_InitializeQualMaps.
static TQualFixMap s_IsolationSourceMap;
// Set by s_InitializeQualMaps after the table has been filled, so later
// calls return without reloading.
static bool s_QualFixupMapsInitialized = false;

static void s_ProcessQualMapLine(const CTempString& line, TQualFixMap& qual_map)
{
    vector<CTempString> tokens;
    NStr::Split(line, "\t", tokens);
    if (tokens.size() > 1) {
        qual_map[tokens[0]] = tokens[1];
    }
}


void s_AddOneDataFile(const string& file_name, const string& data_name,
                      const char **built_in, size_t num_built_in,
                      TQualFixMap& qual_map)
{
    string file = g_FindDataFile(file_name);
    CRef<ILineReader> lr;
    if (!file.empty()) {
        try {
            lr = ILineReader::New(file);
        } NCBI_CATCH("s_InitializeQualMaps")
    }

    if (lr.Empty()) {
        if (built_in == NULL) {
            ERR_POST(Note << "No data for " + data_name);
        } else {
            if (getenv("NCBI_DEBUG")) {
                ERR_POST(Note << "Falling back on built-in data for " + data_name);
            }
            for (size_t i = 0; i < num_built_in; i++) {
                const char *p = built_in[i];
                s_ProcessQualMapLine(p, qual_map);
            }
        }
    } else {
        if (getenv("NCBI_DEBUG")) {
            ERR_POST(Note << "Reading from " + file + " for " + data_name);
        }
        do {
            s_ProcessQualMapLine(*++*lr, qual_map);
        } while (!lr->AtEOF());
    }
}

#include "isolation_sources.inc"

/// Fill s_IsolationSourceMap on first use.
///
/// The whole function runs under s_QualFixMutex; once the initialized flag
/// is set, later calls do nothing.
static void s_InitializeQualMaps(void)
{
    CFastMutexGuard GUARD(s_QualFixMutex);
    if (!s_QualFixupMapsInitialized) {
        // tissue types
        s_AddOneDataFile("isolation_sources.txt", "isolation sources",
                         (const char **)k_isolation_sources,
                         sizeof(k_isolation_sources) / sizeof(char *),
                         s_IsolationSourceMap);
        s_QualFixupMapsInitialized = true;
    }
}





/// Normalize the capitalization of an isolation-source value.
///
/// A match in the isolation-source map wins outright. Otherwise the value
/// is matched case-insensitively against the sex-qualifier tokens and then
/// passed through the host, dev-stage and cell-type fixers in that order.
///
/// @param value  Isolation-source value to normalize.
/// @return       The normalized value.
string CSubSource::FixIsolationSourceCapitalization(const string& value)
{
    s_InitializeQualMaps();

    const auto known = s_IsolationSourceMap.find(value);
    if (known != s_IsolationSourceMap.end()) {
        return known->second;
    }

    string fix = value;

    const size_t num_tokens = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
    for (size_t idx = 0; idx < num_tokens; ++idx) {
        if (NStr::EqualNocase(fix, sm_ValidSexQualifierTokens[idx])) {
            fix = sm_ValidSexQualifierTokens[idx];
            break;
        }
    }

    fix = COrgMod::FixHostCapitalization(fix);
    fix = FixDevStageCapitalization(fix);
    fix = FixCellTypeCapitalization(fix);
    return fix;
}


/// Normalize the capitalization of a tissue-type value.
///
/// Uses the same sources as the isolation-source fixer: the isolation-source
/// map first, then the sex-qualifier tokens (case-insensitive match),
/// followed by the host, dev-stage and cell-type fixers.
///
/// @param value  Tissue-type value to normalize.
/// @return       The normalized value.
string CSubSource::FixTissueTypeCapitalization(const string& value)
{
    s_InitializeQualMaps();

    const auto mapped = s_IsolationSourceMap.find(value);
    if (mapped != s_IsolationSourceMap.end()) {
        return mapped->second;
    }

    string fix = value;

    const size_t token_count = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
    size_t pos = 0;
    while (pos < token_count) {
        if (NStr::EqualNocase(fix, sm_ValidSexQualifierTokens[pos])) {
            fix = sm_ValidSexQualifierTokens[pos];
            break;
        }
        ++pos;
    }

    fix = COrgMod::FixHostCapitalization(fix);
    fix = FixDevStageCapitalization(fix);
    fix = FixCellTypeCapitalization(fix);
    return fix;
}


/// Normalize the capitalization of a lab-host value.
/// Forwards to COrgMod::FixHostCapitalization() and returns its result.
string CSubSource::FixLabHostCapitalization(const string& value)
{
    return COrgMod::FixHostCapitalization(value);
}


/// Dispatch to the capitalization fixer that belongs to @a subtype.
///
/// @param subtype  Sub-source subtype that selects the fixer.
/// @param value    Value to fix.
/// @return         The fixed value. Subtypes without a fixer return
///                 @a value unchanged, as does the sex subtype when its
///                 fixer yields a blank string.
string CSubSource::FixCapitalization(TSubtype subtype, const string& value)
{
    switch (subtype) {
        case CSubSource::eSubtype_sex: {
            const string fixed = FixSexQualifierValue(value);
            if (NStr::IsBlank(fixed)) {
                return value;
            }
            return fixed;
        }
        case CSubSource::eSubtype_isolation_source:
            return FixIsolationSourceCapitalization(value);
        case CSubSource::eSubtype_lab_host:
            return FixLabHostCapitalization(value);
        case CSubSource::eSubtype_tissue_type:
            return FixTissueTypeCapitalization(value);
        case CSubSource::eSubtype_dev_stage:
            return FixDevStageCapitalization(value);
        case CSubSource::eSubtype_cell_type:
            return FixCellTypeCapitalization(value);
        default:
            break;
    }
    return value;
}


void CSubSource::FixCapitalization()
{
    if (!IsSetSubtype() || !IsSetName()) {
        return;
    }

    TSubtype subtype = GetSubtype();

    if (subtype == CSubSource::eSubtype_sex) {
        string upr = GetName();
        string lwr = upr;
        NStr::ToLower(lwr);
        if (! NStr::Equal(upr, lwr)) {
            SetName(lwr);
        }
    }

    const string& name = GetName();

    string new_val = FixCapitalization(subtype, name);

    if (!NStr::IsBlank(new_val)) {
        SetName(new_val);
    }

}


/// Run the automatic fixer that belongs to @a subtype on @a value.
///
/// @param subtype  Sub-source subtype that selects the fixer.
/// @param value    Value to fix.
/// @return         The fixer's result for country, collection-date, lat-lon,
///                 sex and altitude subtypes; an empty string for any other
///                 subtype.
string CSubSource::AutoFix(TSubtype subtype, const string& value)
{
    switch (subtype) {
        case CSubSource::eSubtype_country:
            return CCountries::NewFixCountry(value);
        case CSubSource::eSubtype_collection_date:
            return FixDateFormat(value);
        case CSubSource::eSubtype_lat_lon:
            return FixLatLonFormat(value);
        case CSubSource::eSubtype_sex:
            return FixSexQualifierValue(value);
        case CSubSource::eSubtype_altitude:
            return FixAltitude(value);
        default:
            break;
    }
    return string();
}


void CSubSource::AutoFix()
{
    if (!IsSetSubtype() || !IsSetName()) {
        return;
    }

    TSubtype subtype = GetSubtype();
    string new_val = AutoFix(subtype, GetName());

    if (!NStr::IsBlank(new_val)) {
        SetName(new_val);
    } else if (subtype == CSubSource::eSubtype_sex) {
        string upr = GetName();
        string lwr = upr;
        NStr::ToLower(lwr);
        if (! NStr::Equal(upr, lwr)) {
            SetName(lwr);
        }
    }
}



// NOTE (for the two arrays below): if string A is a prefix of string B, string B must be
// placed BEFORE string A, i.e. the longer string must come first.
// NULL-terminated list of note fragments that CSubSource::RemoveCultureNotes()
// strips out of a value wherever they occur (case-insensitive search) and that
// CSubSource::HasCultureNotes() reports when found anywhere in a value.
static const char * s_RemovableCultureNotes[] = {
    "[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]; [dgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]",
    "[BankIt_cultured16S_wizard]",
    "[BankIt_organellerRNA_wizard]",
    "[BankIt_ITS_wizard]; [rRNAITS_notfound]",
    "[BankIt_ITS_wizard]",
    "[uncultured (using universal primers)]",
    "[uncultured (using universal primers) bacterial source]",
    "[cultured bacterial source]",
    "[enrichment culture bacterial source]",
    "[mixed bacterial source (cultured and uncultured)]",
    "[uncultured]; [universal primers]",
    "[mixed bacterial source]",
    "[virus wizard]",
    "[cDNA derived from mRNA, purified viral particles]",
    "[cDNA derived from mRNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, purified viral particles]",
    "[universal primers]",
    "[uncultured; wizard]",
    "[uncultured; wizard; spans unknown]",
    "[cultured; wizard]",
    "[cultured; wizard; spans unknown]",
    "[intergenic wizard]",
    "[intergenic wizard; spans unknown]",
    "[Microsatellite wizard]",
    "[Microsatellite wizard; multiple repeats]",
    "[D-loop wizard]",
    "[D-loop wizard; spans unknown]",
    "[D-loop wizard; spans known]",
    NULL
};

// NULL-terminated list of notes that are matched against the WHOLE value
// (case-insensitive). CSubSource::RemoveCultureNotes() replaces a matching
// value with "amplified with species-specific primers" when called with
// is_species_level == true.
static const char * s_ReplaceableCultureNotes[] = {
 "[BankIt_uncultured16S_wizard]; [species_specific primers]; [tgge]",
 "[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]",
 "[BankIt_uncultured16S_wizard]; [species_specific primers]",
 "[uncultured (with species-specific primers)]",
 "[uncultured]; [amplified with species-specific primers]",
 "[uncultured (using species-specific primers) bacterial source]",
 "[amplified with species-specific primers]",
 NULL
};


/// Tell whether @a value carries any known culture note.
///
/// @param value  Text to inspect.
/// @return       true when a removable note occurs anywhere in @a value, or
///               when the whole of @a value equals a replaceable note (both
///               comparisons ignore case); false otherwise.
bool CSubSource::HasCultureNotes(const string& value)
{
    // Removable notes count when they appear anywhere inside the value.
    for (const char* const* note = s_RemovableCultureNotes; *note != NULL; ++note) {
        if (NStr::FindNoCase(value, *note) != string::npos) {
            return true;
        }
    }

    // Replaceable notes only count when they make up the entire value.
    for (const char* const* note = s_ReplaceableCultureNotes; *note != NULL; ++note) {
        if (NStr::EqualNocase(value, *note)) {
            return true;
        }
    }

    return false;
}


/// Strip known culture notes from @a value in place.
///
/// Every occurrence of a removable note is deleted together with the run of
/// spaces/semicolons that follows it; leading and trailing spaces/semicolons
/// are then trimmed. Finally, when @a is_species_level is true and what is
/// left equals a replaceable note, the value becomes
/// "amplified with species-specific primers".
///
/// @param value             Text to clean; modified in place.
/// @param is_species_level  Enables the replaceable-note substitution.
void CSubSource::RemoveCultureNotes (string& value, bool is_species_level)
{
    if (NStr::IsBlank(value)) {
        return;
    }

    for (const char* const* note = s_RemovableCultureNotes; *note != NULL; ++note) {
        const string target = *note;
        const size_t target_len = target.length();
        for (size_t at = NStr::FindNoCase(value, target);
             at != NPOS;
             at = NStr::FindNoCase(value, target)) {
            // also swallow the separators directly after the note
            const size_t sep_len = strspn (value.c_str() + at + target_len, " ;");
            value.erase(at, target_len + sep_len);
        }
    }

    // remove leading/trailing semicolons (and spaces)
    value.erase(0, value.find_first_not_of(" ;"));
    value.erase(value.find_last_not_of(" ;") + 1);

    if (!is_species_level) {
        return;
    }
    for (const char* const* note = s_ReplaceableCultureNotes; *note != NULL; ++note) {
        if (NStr::EqualNocase(value, *note)) {
            value = "amplified with species-specific primers";
            break;
        }
    }
}


/// Strip culture notes from this sub-source's name.
/// The name is reset when nothing but blanks remains afterwards.
void CSubSource::RemoveCultureNotes (bool is_species_level)
{
    if (!IsSetName()) {
        return;
    }

    RemoveCultureNotes(SetName(), is_species_level);
    if (NStr::IsBlank(GetName())) {
        ResetName();
    }
}


// CCountryLine
/// Build one line segment of a region.
///
/// @param country_name  Name stored for the region this line belongs to.
/// @param y             Latitude of the line, converted with x_ConvertLat().
/// @param min_x         First longitude bound, converted with x_ConvertLon().
/// @param max_x         Second longitude bound, converted with x_ConvertLon().
/// @param scale         Scale factor stored in m_Scale and used by the
///                      conversions above.
CCountryLine::CCountryLine
(const string & country_name, double y, double min_x, double max_x, double scale)
: m_CountryName(country_name) ,
  m_Scale (scale)
{
    // The conversions read m_Scale, which the initializer list has set.
    m_Y = x_ConvertLat(y);
    m_MinX = x_ConvertLon(min_x);
    m_MaxX = x_ConvertLon(max_x);

}


/// Destructor; performs no explicit cleanup.
CCountryLine::~CCountryLine (void)
{
}


#define EPSILON 0.001

/// Convert a latitude in degrees to a scaled integer coordinate.
///
/// The input is clamped to [-90, 90], multiplied by @a scale and truncated
/// toward zero after EPSILON has been added to its magnitude.
///
/// @param y      Latitude in degrees.
/// @param scale  Multiplier applied before truncation.
/// @return       The scaled integer latitude.
int CCountryLine::ConvertLat (double y, double scale)
{
    if (y < -90.0) {
        y = -90.0;
    } else if (y > 90.0) {
        y = 90.0;
    }

    if (y > 0) {
        return (int) (y * scale + EPSILON);
    }
    return (int) (-(-y * scale + EPSILON));
}


/// Convert latitude @a y with ConvertLat() using this line's own m_Scale.
int CCountryLine::x_ConvertLat (double y)
{
    return ConvertLat(y, m_Scale);
}

/// Convert a longitude in degrees to a scaled integer coordinate.
///
/// The input is clamped to [-180, 180], multiplied by @a scale and truncated
/// toward zero after EPSILON has been added to its magnitude.
///
/// @param x      Longitude in degrees.
/// @param scale  Multiplier applied before truncation.
/// @return       The scaled integer longitude.
int CCountryLine::ConvertLon (double x, double scale)
{
    if (x < -180.0) {
        x = -180.0;
    } else if (x > 180.0) {
        x = 180.0;
    }

    if (x > 0) {
        return (int) (x * scale + EPSILON);
    }
    return (int) (-(-x * scale + EPSILON));
}


/// Convert longitude @a x with ConvertLon() using this line's own m_Scale.
int CCountryLine::x_ConvertLon (double x)
{
    return ConvertLon(x, m_Scale);
}


/// Build a bounding block for a region.
///
/// The initial area is the cell count of the bounding rectangle. The name is
/// split at the first ':' into level 0 (before the colon) and level 1 (after
/// it), each with surrounding spaces removed; without a colon the whole name
/// is level 0 and level 1 is empty.
CCountryExtreme::CCountryExtreme (const string & country_name, int min_x, int min_y, int max_x, int max_y)
: m_CountryName(country_name) , m_MinX (min_x), m_MinY (min_y), m_MaxX(max_x), m_MaxY (max_y)
{
    m_Area = (1 + m_MaxY - m_MinY) * (1 + m_MaxX - m_MinX);

    const size_t colon = NStr::Find(country_name, ":");
    if (colon != NPOS) {
        m_Level0 = country_name.substr(0, colon);
        NStr::TruncateSpacesInPlace(m_Level0);
        m_Level1 = country_name.substr(colon + 1);
        NStr::TruncateSpacesInPlace(m_Level1);
    } else {
        m_Level0 = country_name;
        m_Level1.clear();
    }
}


/// Destructor; performs no explicit cleanup.
CCountryExtreme::~CCountryExtreme (void)
{

}


/// Lower m_MinX to @a min_x when that extends the block.
/// @return true when m_MinX was changed, false otherwise.
bool CCountryExtreme::SetMinX(int min_x)
{
    const bool extends = min_x < m_MinX;
    if (extends) {
        m_MinX = min_x;
    }
    return extends;
}


/// Raise m_MaxX to @a max_x when that extends the block.
/// @return true when m_MaxX was changed, false otherwise.
bool CCountryExtreme::SetMaxX(int max_x)
{
    const bool extends = max_x > m_MaxX;
    if (extends) {
        m_MaxX = max_x;
    }
    return extends;
}


/// Lower m_MinY to @a min_y when that extends the block.
/// @return true when m_MinY was changed, false otherwise.
bool CCountryExtreme::SetMinY(int min_y)
{
    const bool extends = min_y < m_MinY;
    if (extends) {
        m_MinY = min_y;
    }
    return extends;
}


/// Raise m_MaxY to @a max_y when that extends the block.
/// @return true when m_MaxY was changed, false otherwise.
bool CCountryExtreme::SetMaxY(int max_y)
{
    const bool extends = max_y > m_MaxY;
    if (extends) {
        m_MaxY = max_y;
    }
    return extends;
}


/// Grow this block so that it covers @a line and add the line's cell count
/// (1 + max X - min X) to the area. A NULL @a line is ignored.
void CCountryExtreme::AddLine(const CCountryLine *line)
{
    if (!line) {
        return;
    }

    const int line_min_x = line->GetMinX();
    const int line_max_x = line->GetMaxX();
    const int line_y     = line->GetY();

    SetMinX(line_min_x);
    SetMaxX(line_max_x);
    SetMinY(line_y);
    SetMaxY(line_y);
    m_Area += 1 + line_max_x - line_min_x;
}


/// Tell whether this block's bounding rectangle overlaps @a other_block's.
///
/// Two cases are checked: this block's max X lies inside the other block's
/// X range, or the other block's max X lies inside this block's X range. In
/// either case the Y ranges must overlap as well.
///
/// @param other_block  Block to compare against; NULL yields false.
/// @return             true when the rectangles overlap.
bool CCountryExtreme::DoesOverlap(const CCountryExtreme* other_block) const
{
    if (!other_block) {
        return false;
    }

    const bool my_right_edge_inside_other =
           m_MaxX >= other_block->GetMinX()
        && m_MaxX <= other_block->GetMaxX()
        && m_MaxY >= other_block->GetMinY()
        && m_MinY <= other_block->GetMaxY();
    if (my_right_edge_inside_other) {
        return true;
    }

    return other_block->GetMaxX() >= m_MinX
        && other_block->GetMaxX() <= m_MaxX
        && other_block->GetMaxY() >= m_MinY
        && other_block->GetMinY() <= m_MaxY;
}


/// Decide whether this block should be preferred over @a other_block.
///
/// @param other_block  Competing block; NULL means this block wins.
/// @param country      Preferred country (level 0); blank means no preference.
/// @param province     Preferred province (level 1); may be blank.
/// @param prefer_new   Result to return whenever the country/province
///                     preferences do not separate the two blocks.
/// @return             true when this block is preferred.
bool CCountryExtreme::PreferTo(const CCountryExtreme* other_block, const string country, const string province, const bool prefer_new) const
{
    if (!other_block) {
        return true;
    }

    // if no preferred country, these are equal
    if (NStr::IsBlank(country)) {
        return prefer_new;
    }

    const bool this_is_country  = NStr::EqualNocase(country, m_Level0);
    const bool other_is_country = NStr::EqualNocase(country, other_block->GetLevel0());

    // exactly one of the two matches the preferred country: that one wins
    if (this_is_country != other_is_country) {
        return this_is_country;
    }

    // neither matches the preferred country: take smallest
    if (!this_is_country) {
        return prefer_new;
    }

    // both match the country: a province match only on this side wins
    if (!NStr::IsBlank(province)
        && NStr::EqualNocase(province, m_Level1)
        && !NStr::EqualNocase(province, other_block->GetLevel1())) {
        return true;
    }

    // if both match province, or neither does, or no preferred province, take smallest
    return prefer_new;
}


/// Store the coordinates and set the land, water and claimed distances to -1.
CLatLonCountryId::CLatLonCountryId(float lat, float lon)
    : m_Lat(lat),
      m_Lon(lon),
      m_LandDistance(-1),
      m_WaterDistance(-1),
      m_ClaimedDistance(-1)
{}


/// Compare the claimed @a country / @a province with the guessed and closest
/// regions stored in this object and return the result as a set of flags
/// (fCountryMatch, fProvinceMatch, fCountryClosest, fProvinceClosest,
/// fWaterMatch, fWaterClosest). All comparisons ignore case.
///
/// Side effect: when there is neither a guessed country nor a guessed water
/// body and @a country equals the closest country, the closest country (and
/// the closest province, if it matches @a province) is copied into the guess
/// fields via SetGuessCountry() / SetGuessProvince() / SetFullGuess().
CLatLonCountryId::TClassificationFlags CLatLonCountryId::Classify(string country, string province)
{
    CLatLonCountryId::TClassificationFlags rval = 0;

    // compare guesses or closest regions to indicated country and province
    if (!NStr::IsBlank(GetGuessCountry())) {
        // if top level countries match
        if (NStr::EqualNocase(country, GetGuessCountry())) {
            rval |= CLatLonCountryId::fCountryMatch;
            // if both are empty, still call it a match
            if (NStr::EqualNocase(province, GetGuessProvince())) {
                rval |= CLatLonCountryId::fProvinceMatch;
            }
        }
        // if they don't match, are they closest?
        if (!(rval & CLatLonCountryId::fCountryMatch)) {
            if (NStr::EqualNocase(country, GetClosestCountry())) {
                rval |= CLatLonCountryId::fCountryClosest;
                if (NStr::EqualNocase(province, GetClosestProvince())) {
                    rval |= CLatLonCountryId::fProvinceClosest;
                }
            }
        } else if (!(rval & CLatLonCountryId::fProvinceMatch) && !NStr::IsBlank(province)) {
            if (NStr::EqualNocase (province, GetClosestProvince())) {
                rval |= CLatLonCountryId::fProvinceClosest;
            }
        }
    }

    if (!NStr::IsBlank(GetGuessWater())) {
        // was the non-approved body of water correctly indicated?
        if (NStr::EqualNocase(country, GetGuessWater())) {
            rval |= CLatLonCountryId::fWaterMatch;
        } else if (NStr::EqualNocase(country, GetClosestWater())) {
            rval |= CLatLonCountryId::fWaterClosest;
        }
    }

    if (!NStr::IsBlank(GetClosestCountry()) && NStr::EqualNocase(country, GetClosestCountry())) {
        if (NStr::IsBlank(GetGuessCountry()) && NStr::IsBlank(GetGuessWater())) {
            rval |= CLatLonCountryId::fCountryMatch;
            SetGuessCountry(GetClosestCountry());
            SetFullGuess(GetClosestCountry());
            if (!NStr::IsBlank(GetClosestProvince()) && NStr::EqualNocase(province, GetClosestProvince())) {
                rval |= CLatLonCountryId::fProvinceMatch;
                SetGuessProvince(GetClosestProvince());
                SetFullGuess(GetClosestFull());
            }
        } else {
            rval |= CLatLonCountryId::fCountryClosest;
            if (!NStr::IsBlank(GetClosestProvince()) && NStr::EqualNocase(province, GetClosestProvince())) {
                rval |= CLatLonCountryId::fProvinceClosest;
            }
        }
    }
    return rval;
}


/// Destructor; performs no explicit cleanup.
CLatLonCountryId::~CLatLonCountryId(void)
{
}


// Built-in country data (s_DefaultLatLonCountryText) and its entry count;
// used by the CLatLonCountryMap constructor when no data file can be read.
#include "lat_lon_country.inc"
static const size_t k_NumLatLonCountryText = ArraySize(s_DefaultLatLonCountryText);

// Built-in water data (s_DefaultLatLonWaterText) and its entry count;
// used the same way when the map is built with is_water == true.
#include "lat_lon_water.inc"
static const size_t k_NumLatLonWaterText = ArraySize(s_DefaultLatLonWaterText);

void CLatLonCountryMap::x_InitFromDefaultList(const char * const *list, int num)
{
    if (getenv("NCBI_DEBUG")) {
        ERR_POST(Note << "Falling back on built-in data for latlon / water data.");
    }
      // initialize list of country lines
    m_CountryLineList.clear();
    m_Scale = 20.0;
    string current_country;

    for (int i = 0; i < num; i++) {
        CTempString line = list[i];
        if (line[0] == '-') {
            // skip comment
        } else if (isalpha ((unsigned char)line[0])) {
            current_country = line;
        } else if (isdigit ((unsigned char)line[0])) {
            m_Scale = NStr::StringToDouble(line);
        } else {
            vector<string> tokens;
             NStr::Split(line, "\t", tokens);
            if (tokens.size() > 3) {
                double x = NStr::StringToDouble(tokens[1]);
                for (size_t j = 2; j < tokens.size() - 1; j+=2) {
                    m_CountryLineList.push_back(new CCountryLine(current_country, x, NStr::StringToDouble(tokens[j]), NStr::StringToDouble(tokens[j + 1]), m_Scale));
                }
            }
        }
    }
}




/// Append the lines described in a data file to m_CountryLineList.
///
/// Lines are classified by their first character: '-' is a comment, a
/// letter starts a new region name, a digit sets the scale, and anything
/// else is a tab-separated data line whose second field is the latitude,
/// followed by pairs of longitude bounds (one CCountryLine per pair).
///
/// @param filename  Name handed to g_FindDataFile() to locate the file.
/// @return          false when the file cannot be located or no reader is
///                  created for it; true once the file has been read.
bool CLatLonCountryMap::x_InitFromFile(const string& filename)
{
    const string fname = g_FindDataFile (filename);
    if (NStr::IsBlank (fname)) {
        return false;
    }
    if (getenv("NCBI_DEBUG")) {
        ERR_POST(Note << "Reading from " + filename + " for latlon/water data.");
    }

    CRef<ILineReader> lr = ILineReader::New (fname);
    if (lr.Empty()) {
        return false;
    }

    m_Scale = 20.0;
    string current_country;

    // Declared outside the loop in the interest of speed (avoids repeated
    // construction/destruction); cleared before every use.
    vector<SIZE_TYPE> tab_positions;

    do {
        CTempString line = *++*lr;
        const char lead = line[0];

        if (lead == '-') {
            continue;  // skip comment
        }
        if (isalpha ((unsigned char)lead)) {
            current_country = line;
            continue;
        }
        if (isdigit ((unsigned char)lead)) {
            m_Scale = NStr::StringToDouble(line);
            continue;
        }

        // A generic tokenizer would be much simpler, but it is too slow in
        // this case, especially in debug mode. If even more speed is ever
        // needed, the tab positions could be collected on the fly without
        // any heap-allocated memory.

        // find position of all tabs on this line
        tab_positions.clear();
        for (SIZE_TYPE tab_pos = line.find('\t');
             tab_pos != NPOS;
             tab_pos = line.find('\t', tab_pos + 1)) {
            tab_positions.push_back(tab_pos);
        }
        // an imaginary sentinel tab
        tab_positions.push_back(line.length());

        if (tab_positions.size() < 4) {
            continue;
        }

        const char * line_start = line.data();
        CTempString y_str( line_start + tab_positions[0]+1, tab_positions[1] - tab_positions[0] - 1 );
        const double y = NStr::StringToDouble( y_str );

        // convert into line list
        for (size_t j = 1; j + 2 < tab_positions.size(); j += 2) {
            const SIZE_TYPE start  = tab_positions[j];
            const SIZE_TYPE middle = tab_positions[j+1];
            const SIZE_TYPE finish = tab_positions[j+2];
            CTempString first_num( line_start + start + 1, middle - start - 1 );
            CTempString second_num( line_start + middle + 1, finish - middle - 1 );
            m_CountryLineList.push_back(new CCountryLine(current_country, y, NStr::StringToDouble(first_num), NStr::StringToDouble(second_num), m_Scale));
        }
    } while ( !lr->AtEOF() );

    return true;
}

/// Strict "less than" for two lines: by Y first, then by min X.
bool
CLatLonCountryMap::s_CompareTwoLinesByLatLonOnly(
    const CCountryLine* line1,
    const CCountryLine* line2)
{
    if (line1->GetY() != line2->GetY()) {
        return line1->GetY() < line2->GetY();
    }
    return line1->GetMinX() < line2->GetMinX();
}

/// Strict "less than" for two lines: by country name (ignoring case) first,
/// with ties broken by s_CompareTwoLinesByLatLonOnly().
bool CLatLonCountryMap::
        s_CompareTwoLinesByCountry(const CCountryLine* line1,
                                    const CCountryLine* line2)
{
    const int name_order = NStr::CompareNocase(line1->GetCountry(), line2->GetCountry());
    if (name_order != 0) {
        return name_order < 0;
    }
    return s_CompareTwoLinesByLatLonOnly(line1, line2);
}


/// Strict "less than" for two lines: by Y, then min X, then max X, and
/// finally by country name (ignoring case).
bool CLatLonCountryMap::
        s_CompareTwoLinesByLatLonThenCountry(const CCountryLine* line1,
                                    const CCountryLine* line2)
{
    if (line1->GetY() != line2->GetY()) {
        return line1->GetY() < line2->GetY();
    }
    if (line1->GetMinX() != line2->GetMinX()) {
        return line1->GetMinX() < line2->GetMinX();
    }
    if (line1->GetMaxX() != line2->GetMaxX()) {
        return line1->GetMaxX() < line2->GetMaxX();
    }
    return NStr::CompareNocase(line1->GetCountry(), line2->GetCountry()) < 0;
}


/// Load the country (or water) line data and build the lookup indexes.
///
/// Data source, in order of preference: the data file under its plain name,
/// the same file under the directory named by the NCBI_LAT_LON_DATA_PATH
/// environment variable, and finally the compiled-in default list.
///
/// After loading, m_CountryLineList is ordered by country name (ignoring
/// case) and then by lat/lon; m_CountryExtremes receives one block per run
/// of lines with the same country name; m_LatLonSortedList holds the same
/// line pointers ordered by lat/lon and then country.
///
/// @param is_water  true to load the water data set, false for countries.
CLatLonCountryMap::CLatLonCountryMap (bool is_water)
{
    // initialize list of country lines
    m_CountryLineList.clear();

    const char* env_val = getenv("NCBI_LAT_LON_DATA_PATH");
    string data_path;
    if (env_val) {
        data_path = (string) env_val;
        if (! NStr::EndsWith(data_path, "/")) {
            data_path = data_path + "/";
        }
    }

    if (is_water) {
        if (!x_InitFromFile("lat_lon_water.txt")) {
            if (data_path.empty() || !x_InitFromFile(data_path + "lat_lon_water.txt")) {
                x_InitFromDefaultList(s_DefaultLatLonWaterText, k_NumLatLonWaterText);
            }
        }
    } else {
        if (!x_InitFromFile("lat_lon_country.txt")) {
            if (data_path.empty() || !x_InitFromFile(data_path + "lat_lon_country.txt")) {
                x_InitFromDefaultList(s_DefaultLatLonCountryText, k_NumLatLonCountryText);
            }
        }
    }

    // Instead of doing a plain sort, we take advantage of the fact that
    // there are few unique country names versus the number
    // of lines.
    typedef map<CTempString, TCountryLineList, PNocase> TCountryToLinesMap;
    // this map maps a country name (case insens) to all the lines that
    // belong to that country.
    TCountryToLinesMap countryToLinesMap;
    ITERATE(TCountryLineList, line_it, m_CountryLineList) {
        countryToLinesMap[(*line_it)->GetCountry()].push_back(*line_it);
    }

    // build new m_CountryLineList here:
    TCountryLineList new_country_line_list;
    NON_CONST_ITERATE(TCountryToLinesMap, country_lines_it, countryToLinesMap)
    {
        // sort the lines for each country by lat/lon only, since we've already
        // implicitly sorted by country in countryToLinesMap
        TCountryLineList & line_list_for_this_country =
            country_lines_it->second;
        stable_sort(
            BEGIN_COMMA_END(line_list_for_this_country),
            s_CompareTwoLinesByLatLonOnly);
        copy(BEGIN_COMMA_END(line_list_for_this_country),
             back_inserter(new_country_line_list));
    }
    // swap should be constant time
    m_CountryLineList.swap(new_country_line_list);

    // set up extremes index and copy into LatLon index
    m_CountryExtremes.clear();
    m_LatLonSortedList.clear();
      size_t i, ext = 0;

    for (i = 0; i < m_CountryLineList.size(); i++) {
        // a line extends the current block only when its country name is
        // exactly equal (case-sensitive) to that of the previous block
        if (ext > 0 && NStr::Equal(m_CountryLineList[i]->GetCountry(), m_CountryExtremes[ext - 1]->GetCountry())) {
            m_CountryExtremes[ext - 1]->AddLine(m_CountryLineList[i]);
        } else {
            m_CountryExtremes.push_back(new CCountryExtreme(m_CountryLineList[i]->GetCountry(),
                                                m_CountryLineList[i]->GetMinX(),
                                                m_CountryLineList[i]->GetY(),
                                                m_CountryLineList[i]->GetMaxX(),
                                                m_CountryLineList[i]->GetY()));
            ext++;
        }
        m_LatLonSortedList.push_back(m_CountryLineList[i]);
        m_CountryLineList[i]->SetBlock(m_CountryExtremes[ext - 1]);
    }
    sort (m_LatLonSortedList.begin(), m_LatLonSortedList.end(), s_CompareTwoLinesByLatLonThenCountry);

}


/// Delete the lines and blocks created by the constructor and empty all
/// three containers.
CLatLonCountryMap::~CLatLonCountryMap (void)
{
    for (auto line : m_CountryLineList) {
        delete line;
    }
    m_CountryLineList.clear();

    for (auto block : m_CountryExtremes) {
        delete block;
    }
    m_CountryExtremes.clear();

    // note - do not delete items in m_LatLonSortedList, they are pointing to the same objects as m_CountryLineList
    m_LatLonSortedList.clear();
}


/// Tell whether the point (@a lat, @a lon) lies on one of the stored lines
/// of @a country.
///
/// m_CountryLineList is ordered by country name ignoring case (see the
/// constructor), so the binary search must compare names case-insensitively
/// as well; a case-sensitive comparison can step past the wanted country.
///
/// @param country  Region name to look for (compared ignoring case).
/// @param lat      Latitude in degrees.
/// @param lon      Longitude in degrees.
/// @return         true when a line of @a country at the scaled latitude
///                 spans the scaled longitude; false otherwise, including
///                 when no line data is loaded.
bool CLatLonCountryMap::IsCountryInLatLon(const string& country, double lat,
                                          double lon) const
{
    // Guard: "size() - 1" below would wrap around on an empty list.
    if (m_CountryLineList.empty()) {
        return false;
    }

    int x = CCountryLine::ConvertLon(lon, m_Scale);
    int y = CCountryLine::ConvertLat(lat, m_Scale);

    size_t L, R, mid;

    L = 0;
    R = m_CountryLineList.size() - 1;
    mid = 0;

    while (L < R) {
        mid = (L + R) / 2;
        int cmp = NStr::CompareNocase(m_CountryLineList[mid]->GetCountry(), country);
        if (cmp < 0) {
            L = mid + 1;
        } else if (cmp > 0) {
            R = mid;
        } else {
            // back up to the first line of this country at or above y
            while (mid > 0
                   && NStr::CompareNocase(m_CountryLineList[mid - 1]->GetCountry(), country) == 0
                   && m_CountryLineList[mid - 1]->GetY() >= y) {
                mid--;
            }
            L = mid;
            R = mid;
        }
    }

    // skip lines of this country that lie below the wanted latitude
    while (R < m_CountryLineList.size()
           && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
           && m_CountryLineList[R]->GetY() < y) {
        R++;
    }

    // skip lines at the wanted latitude that end before the wanted longitude
    while (R < m_CountryLineList.size()
           && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
           && m_CountryLineList[R]->GetY() == y
           && m_CountryLineList[R]->GetMaxX() < x) {
        R++;
    }

    return R < m_CountryLineList.size()
           && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
           && m_CountryLineList[R]->GetY() == y
           && m_CountryLineList[R]->GetMinX() <= x
           && m_CountryLineList[R]->GetMaxX() >= x;
}


/// Find the block stored for @a country (name compared ignoring case).
///
/// @param country  Region name to look for.
/// @return         The matching block, or NULL when @a country is blank, no
///                 blocks are loaded, or no block has that name.
const CCountryExtreme *
CLatLonCountryMap::x_FindCountryExtreme(const string& country) const
{
    // Guard: "size() - 1" below would wrap around on an empty list.
    if (NStr::IsBlank (country) || m_CountryExtremes.empty()) return NULL;

    size_t L = 0;
    size_t R = m_CountryExtremes.size() - 1;

    // lower-bound binary search over the case-insensitively ordered blocks
    while (L < R) {
        const size_t mid = (L + R) / 2;
        if (NStr::CompareNocase(m_CountryExtremes[mid]->GetCountry(), country) < 0) {
            L = mid + 1;
        } else {
            R = mid;
        }
    }

    if (!NStr::EqualNocase(m_CountryExtremes[R]->GetCountry(), country)) {
        return NULL;
    }
    return m_CountryExtremes[R];
}


/// Tell whether a block is stored for @a region.
bool CLatLonCountryMap::HaveLatLonForRegion(const string& region) const
{
    return x_FindCountryExtreme(region) != NULL;
}


/// Binary-search m_LatLonSortedList for the scaled latitude @a y.
///
/// @param y  Scaled latitude to look for.
/// @return   Index of the first line whose Y equals @a y when such a line is
///           hit by the search; otherwise the index at which the search
///           ended. Returns 0 for an empty list, so callers that test
///           "index < size()" simply see no lines.
size_t CLatLonCountryMap::x_GetLatStartIndex (int y) const
{
    // Guard: "size() - 1" below would wrap around on an empty list.
    if (m_LatLonSortedList.empty()) {
        return 0;
    }

    size_t L, R, mid;

    L = 0;
    R = m_LatLonSortedList.size() - 1;
    mid = 0;

    while (L < R) {
        mid = (L + R) / 2;
        if (m_LatLonSortedList[mid]->GetY() < y) {
            L = mid + 1;
        } else if (m_LatLonSortedList[mid]->GetY() > y) {
            R = mid;
        } else {
            // back up to the first line at this latitude
            while (mid > 0 && m_LatLonSortedList[mid - 1]->GetY() == y) {
                mid--;
            }
            L = mid;
            R = mid;
        }
    }
    return R;
}


/// Find the best region whose stored lines contain the given point.
///
/// All lines at the scaled latitude that span the scaled longitude are
/// candidates. Among them, CCountryExtreme::PreferTo() picks the winner
/// using the preferred @a country / @a province, with the smaller area as
/// the tie-breaker.
///
/// @param lat       Latitude in degrees (converted with ConvertLat, i.e.
///                  clamped to [-90, 90] like in IsCountryInLatLon()).
/// @param lon       Longitude in degrees.
/// @param country   Preferred country; may be blank.
/// @param province  Preferred province; may be blank.
/// @return          The chosen block, or NULL when no line contains the point.
const CCountryExtreme *
CLatLonCountryMap::GuessRegionForLatLon(double lat, double lon,
                                        const string& country,
                                        const string& province) const
{
    const int x = CCountryLine::ConvertLon(lon, m_Scale);
    const int y = CCountryLine::ConvertLat(lat, m_Scale);

    const CCountryExtreme *best = NULL;

    for (size_t R = x_GetLatStartIndex(y);
         R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() == y;
         R++) {
        if (m_LatLonSortedList[R]->GetMinX() > x
            || m_LatLonSortedList[R]->GetMaxX() < x) {
            continue;  // this line does not span the longitude
        }
        const CCountryExtreme *other = m_LatLonSortedList[R]->GetBlock();
        if (best == NULL
            || !best->PreferTo(other, country, province, (bool)(best->GetArea() <= other->GetArea()))) {
            best = other;
        }
    }
    return best;
}


//Distance on a spherical surface calculation adapted from
//http://www.linuxjournal.com/magazine/
//work-shell-calculating-distance-between-two-latitudelongitude-points

#define EARTH_RADIUS 6371.0 /* average radius of non-spherical earth in kilometers */
#define CONST_PI 3.14159265359

/// Convert an angle from degrees to radians.
static double DegreesToRadians (double degrees)
{
    return degrees * (CONST_PI / 180.0);
}

/// Distance between two points on a sphere of radius EARTH_RADIUS, computed
/// with the haversine formula.
///
/// @param latA,lonA  First point, in degrees.
/// @param latB,lonB  Second point, in degrees.
/// @return           Distance in the units of EARTH_RADIUS.
static double DistanceOnGlobe (double latA, double lonA, double latB, double lonB)
{
    const double lat1 = DegreesToRadians (latA);
    const double lon1 = DegreesToRadians (lonA);
    const double lat2 = DegreesToRadians (latB);
    const double lon2 = DegreesToRadians (lonB);

    const double sin_half_dlat = sin ((lat2 - lat1) / 2);
    const double sin_half_dlon = sin ((lon2 - lon1) / 2);

    const double a = sin_half_dlat * sin_half_dlat +
                     cos (lat1) * cos (lat2) * sin_half_dlon * sin_half_dlon;
    const double c = 2 * atan2 (sqrt (a), sqrt (1 - a));

    return (double) (EARTH_RADIUS * c);
}


double ErrorDistance (
  double latA,
  double lonA,
  double scale)
{
  double lat1, lon1, lat2, lon2;
  double dLat, dLon, a, c;

  lat1 = DegreesToRadians (latA);
  lon1 = DegreesToRadians (lonA);
  lat2 = DegreesToRadians (latA + (1.0 / scale));
  lon2 = DegreesToRadians (lonA + (1.0 / scale));

  dLat = lat2 - lat1;
  dLon = lon2 - lon1;

   a = sin (dLat / 2) * sin (dLat / 2) +
       cos (lat1) * cos (lat2) * sin (dLon / 2) * sin (dLon / 2);
   c = 2 * atan2 (sqrt (a), sqrt (1 - a));

  return (double) (EARTH_RADIUS * c);

}


/// Find the region closest to a point, searching within @a range degrees.
///
/// Every stored line inside the square of +/- @a range degrees around the
/// point is considered. The distance to a line is measured to its nearer end
/// when the point lies beside it, or straight along the meridian when the
/// line spans the point's longitude. Ties in distance go to the block with
/// the smaller area, then to a block that has a level-1 (province) name.
///
/// @param lat       Latitude in degrees (converted with ConvertLat, i.e.
///                  clamped to [-90, 90] like in IsCountryInLatLon()).
/// @param lon       Longitude in degrees.
/// @param range     Search radius in degrees.
/// @param distance  Receives the distance to the returned block (0.0 when
///                  nothing is found).
/// @return          The closest block, or NULL when no line is in range.
const CCountryExtreme * CLatLonCountryMap::FindClosestToLatLon(double lat,
                                                               double lon,
                                                               double range,
                                                               double &distance)
{
    int x = CCountryLine::ConvertLon(lon, m_Scale);
    int y = CCountryLine::ConvertLat(lat, m_Scale);

    int maxDelta = (int) (range * m_Scale + EPSILON);
    int min_y = y - maxDelta;
    int max_y = y + maxDelta;
    int min_x = x - maxDelta;
    int max_x = x + maxDelta;

    // binary search to lowest lat
    size_t R = x_GetLatStartIndex(min_y);

    double closest = 0.0;
    CCountryExtreme *rval = NULL;

    while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
        if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
            // out of range, don't bother calculating distance
        } else {
            // longitude of the point on this line that is nearest to (lat, lon)
            double end;
            if (x < m_LatLonSortedList[R]->GetMinX()) {
                end = m_LatLonSortedList[R]->GetMinLon();
            } else if (x > m_LatLonSortedList[R]->GetMaxX()) {
                end = m_LatLonSortedList[R]->GetMaxLon();
            } else {
                end = lon;
            }
            double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
            if (rval == NULL || closest > dist
                || (closest == dist
                    && (rval->GetArea() > m_LatLonSortedList[R]->GetBlock()->GetArea()
                        || (rval->GetArea() == m_LatLonSortedList[R]->GetBlock()->GetArea()
                            && NStr::IsBlank(rval->GetLevel1())
                            && !NStr::IsBlank(m_LatLonSortedList[R]->GetBlock()->GetLevel1()))))) {
                rval = m_LatLonSortedList[R]->GetBlock();
                closest = dist;
            }
        }
        R++;
    }
    distance = closest;
    return rval;
}


// Decide whether comp_country is the country nearest to (lat, lon) among the
// entries of m_LatLonSortedList inside a square search window.
//
//   comp_country - the country name being tested
//   lat, lon     - the point to search around
//   range        - half-width of the search window, in the same units as
//                  lat/lon (it is multiplied by m_Scale to get grid units)
//   distance     - out: DistanceOnGlobe() result for the closest entry;
//                  stays 0.0 when nothing is found
//
// Returns true when the winning country equals comp_country.  On an exact
// distance tie comp_country is preferred; otherwise the country whose
// extreme has the smaller area is preferred.
bool CLatLonCountryMap::IsClosestToLatLon(const string& comp_country,
                                          double lat, double lon,
                                          double range, double &distance) const
{
    int x = CCountryLine::ConvertLon(lon, m_Scale);
    // Latitude goes through ConvertLat, exactly as IsNearLatLon does; it used
    // to be passed to the longitude converter.
    int y = CCountryLine::ConvertLat(lat, m_Scale);

    // Half-width of the search window, in scaled grid units.
    int maxDelta = (int) (range * m_Scale + EPSILON);
    int min_y = y - maxDelta;
    int max_y = y + maxDelta;
    int min_x = x - maxDelta;
    int max_x = x + maxDelta;

    // binary search to lowest lat
    size_t R = x_GetLatStartIndex(min_y);

    string country;
    double closest = 0.0;
    int smallest_area = -1;

    // The loop stops at the first entry whose Y exceeds max_y, so it depends
    // on the list being ordered by GetY().
    while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
        const auto& line = m_LatLonSortedList[R];
        if (line->GetMaxX() < min_x || line->GetMinX() > max_x) {
            // out of range, don't bother calculating distance
        } else {
            // Pick the longitude on this entry that is nearest to the query.
            // The in-span case (end = lon) was missing here, so a point
            // lying within an entry's span was measured to its max-longitude
            // end; FindClosestToLatLon and IsNearLatLon both handle all
            // three cases.
            double end;
            if (x < line->GetMinX()) {
                end = line->GetMinLon();
            } else if (x > line->GetMaxX()) {
                end = line->GetMaxLon();
            } else {
                end = lon;
            }
            double dist = DistanceOnGlobe (lat, lon, line->GetLat(), end);
            if (NStr::IsBlank (country) || closest > dist) {
                // First hit, or a strictly closer one: take it outright.
                country = line->GetCountry();
                closest = dist;
                const CCountryExtreme * ext = x_FindCountryExtreme(country);
                // NOTE(review): if the lookup fails, smallest_area keeps the
                // value recorded for the previously chosen country.
                if (ext) {
                    smallest_area = ext->GetArea();
                }
            } else if (closest == dist) {
                // if the distances are the same, prefer the input country, otherwise prefer the smaller region
                if (NStr::Equal(country, comp_country)) {
                    // keep country we're searching for
                } else if (!NStr::Equal(line->GetCountry(), country)) {
                    const CCountryExtreme * ext = x_FindCountryExtreme(line->GetCountry());
                    if (ext
                        && (ext->GetArea() < smallest_area
                            || NStr::Equal(line->GetCountry(), comp_country))) {
                        country = line->GetCountry();
                        smallest_area = ext->GetArea();
                    }
                }
            }
        }
        R++;
    }
    distance = closest;
    return NStr::Equal(country, comp_country);
}


// Among the entries of m_LatLonSortedList that belong to `country` (and to
// `province`, when one is given) and fall inside a square search window
// around (lat, lon), find the one nearest to the point.
//
//   lat, lon  - the point to search around
//   range     - half-width of the search window, in the same units as
//               lat/lon (it is multiplied by m_Scale to get grid units)
//   distance  - out: DistanceOnGlobe() result for the chosen entry, or -1.0
//               when no entry qualified
//   country   - compared case-insensitively with the block's Level0
//   province  - compared case-insensitively with the block's Level1; a blank
//               value disables the province filter
//
// Returns the block of the nearest qualifying entry, or a null pointer.
const CCountryExtreme * CLatLonCountryMap::IsNearLatLon(double lat, double lon,
                                                        double range,
                                                        double &distance,
                                                        const string& country,
                                                        const string& province) const
{
    const int px = CCountryLine::ConvertLon(lon, m_Scale);
    const int py = CCountryLine::ConvertLat(lat, m_Scale);

    // Search window bounds in scaled grid units.
    const int reach = (int) (range * m_Scale + EPSILON);
    const int x_lo = px - reach;
    const int x_hi = px + reach;
    const int y_lo = py - reach;
    const int y_hi = py + reach;

    const bool any_province = NStr::IsBlank(province);

    CCountryExtreme *best = nullptr;
    double best_dist = -1.0;   // negative means "nothing found yet"

    // Start at the index reported for the lowest latitude of the window and
    // walk forward until an entry's Y passes the top of the window.
    for (size_t i = x_GetLatStartIndex(y_lo);
         i < m_LatLonSortedList.size() && m_LatLonSortedList[i]->GetY() <= y_hi;
         ++i) {
        const auto& line = m_LatLonSortedList[i];

        if (line->GetMaxX() < x_lo || line->GetMinX() > x_hi) {
            continue;   // entirely outside the window in x
        }
        if (!NStr::EqualNocase(line->GetBlock()->GetLevel0(), country)) {
            continue;   // different country
        }
        if (!any_province
            && !NStr::EqualNocase(line->GetBlock()->GetLevel1(), province)) {
            continue;   // different province
        }

        // Measure to the nearer end when the point is outside the entry's
        // span, or straight across when it is within the span.
        double target_lon = lon;
        if (px < line->GetMinX()) {
            target_lon = line->GetMinLon();
        } else if (px > line->GetMaxX()) {
            target_lon = line->GetMaxLon();
        }

        const double d = DistanceOnGlobe (lat, lon, line->GetLat(), target_lon);
        if (best_dist < 0.0 || best_dist > d) {
            best_dist = d;
            best = line->GetBlock();
        }
    }

    distance = best_dist;
    return best;
}





bool CLatLonCountryMap::DoCountryBoxesOverlap(const string& country1,
                                              const string& country2) const
{
    if (NStr::IsBlank (country1) || NStr::IsBlank(country2)) return false;

    const CCountryExtreme *ext1 = x_FindCountryExtreme (country1);
    if (!ext1) {
        return false;
    }
    const CCountryExtreme *ext2 = x_FindCountryExtreme (country2);
    if (!ext2) {
        return false;
    }


    return ext1->DoesOverlap(ext2);
}


// Pad a distance with a scale-dependent allowance and round it to an integer.
//
// The allowance is 111.19 for scale < 1.1, 5.56 for scale strictly between
// 19.5 and 20.5, 1.11 for scale strictly between 99.5 and 100.5, and zero for
// any other scale.  Rounding is done by adding 0.5 and truncating toward
// zero.
// NOTE(review): the allowances look like the size of one grid cell at scales
// 1, 20 and 100 - confirm against the map data.
int CLatLonCountryMap::AdjustAndRoundDistance (double distance, double scale)

{
    double allowance = 0.0;
    if (scale < 1.1) {
        allowance = 111.19;
    } else if (scale > 19.5 && scale < 20.5) {
        allowance = 5.56;
    } else if (scale > 99.5 && scale < 100.5) {
        allowance = 1.11;
    }

    distance += allowance;
    return static_cast<int>(distance + 0.5);
}


// Convenience overload: pad and round `distance` using this map's own scale
// (m_Scale) by forwarding to the two-argument AdjustAndRoundDistance().
int CLatLonCountryMap::AdjustAndRoundDistance (double distance) const

{
    return CLatLonCountryMap::AdjustAndRoundDistance(distance, m_Scale);
}




END_objects_SCOPE // namespace ncbi::objects::

END_NCBI_SCOPE

/* Original file checksum: lines: 65, chars: 1891, CRC32: 7724f0c5 */
