language_tags/
lib.rs

1#![deny(
2    future_incompatible,
3    nonstandard_style,
4    rust_2018_idioms,
5    missing_docs,
6    trivial_casts,
7    trivial_numeric_casts,
8    unused_qualifications
9)]
10#![cfg_attr(test, deny(warnings))]
11
//! Language tags can be used to identify human languages, scripts (e.g. Latin script), countries
//! and other regions.
14//!
//! Language tags are defined in [BCP47](http://tools.ietf.org/html/bcp47); an introduction is
//! ["Language tags in HTML and XML"](http://www.w3.org/International/articles/language-tags/) by
//! the W3C. They are commonly used in HTML and in the HTTP `Content-Language` and
//! `Accept-Language` header fields.
19//!
20//! This package currently supports parsing (fully conformant parser), validation, canonicalization,
21//! formatting and comparing language tags.
22//!
23//! # Examples
24//! Create a simple language tag representing the French language as spoken
25//! in Belgium and print it:
26//!
27//! ```rust
28//! use language_tags::LanguageTag;
29//!
30//! let langtag = LanguageTag::parse("fr-BE").unwrap();
31//! assert_eq!(langtag.to_string(), "fr-BE");
32//! ```
33//!
34//! Parse a tag representing a special type of English specified by private agreement:
35//!
36//! ```rust
37//! use language_tags::LanguageTag;
38//! use std::iter::FromIterator;
39//!
40//! let langtag: LanguageTag = "en-x-twain".parse().unwrap();
41//! assert_eq!(langtag.primary_language(), "en");
42//! assert_eq!(Vec::from_iter(langtag.private_use_subtags()), vec!["twain"]);
43//! ```
44//!
//! You can check for equality, but more often you should test if two tags match.
//! In this example we check if the resource in German language is suitable for
//! a user from Austria. While people speaking Austrian German normally understand
//! standard German, the opposite is not always true. So the resource can be presented
//! to the user, but if the resource was in `de-AT` and a user asked for a representation
//! in `de`, the request should be rejected.
51//!
52//!
53//! ```rust
54//! use language_tags::LanguageTag;
55//!
56//! let mut langtag_server = LanguageTag::parse("de-AT").unwrap();
57//! let mut langtag_user = LanguageTag::parse("de").unwrap();
58//! assert!(langtag_user.matches(&langtag_server));
59//! ```
60
61mod iana_registry;
62#[cfg(feature = "serde")]
63mod serde;
64
65use crate::iana_registry::*;
66use std::error::Error;
67use std::fmt;
68use std::iter::once;
69use std::ops::Deref;
70use std::str::FromStr;
71use std::str::Split;
72
/// A language tag as described in [RFC 5646](https://tools.ietf.org/html/rfc5646).
///
/// Language tags are used to help identify languages, whether spoken,
/// written, signed, or otherwise signaled, for the purpose of
/// communication.  This includes constructed and artificial languages
/// but excludes languages not intended primarily for human
/// communication, such as programming languages.
///
/// The tag is kept as a single string together with the byte offsets at which
/// each of its sections ends. A section that is absent has the same end offset
/// as the section before it.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct LanguageTag {
    /// Full text of the tag.
    /// Syntax described in [RFC 5646 2.1](https://tools.ietf.org/html/rfc5646#section-2.1)
    serialization: String,
    /// Byte offset in `serialization` where the primary language subtag ends.
    language_end: usize,
    /// Byte offset in `serialization` where the extended language subtags end.
    extlang_end: usize,
    /// Byte offset in `serialization` where the script subtag ends.
    script_end: usize,
    /// Byte offset in `serialization` where the region subtag ends.
    region_end: usize,
    /// Byte offset in `serialization` where the variant subtags end.
    variant_end: usize,
    /// Byte offset in `serialization` where the extension subtags end;
    /// anything after it is the private use section.
    extension_end: usize,
}
91
92impl LanguageTag {
93    /// Return the serialization of this language tag.
94    ///
95    /// This is fast since that serialization is already stored in the `LanguageTag` struct.
96    #[inline]
97    pub fn as_str(&self) -> &str {
98        &self.serialization
99    }
100
101    /// Return the serialization of this language tag.
102    ///
103    /// This consumes the `LanguageTag` and takes ownership of the `String` stored in it.
104    #[inline]
105    pub fn into_string(self) -> String {
106        self.serialization
107    }
108
109    /// Return the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1).
110    ///
111    /// ```
112    /// use language_tags::LanguageTag;
113    ///
114    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
115    /// assert_eq!(language_tag.primary_language(), "zh");
116    /// ```
117    #[inline]
118    pub fn primary_language(&self) -> &str {
119        &self.serialization[..self.language_end]
120    }
121
122    /// Return the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2).
123    ///
124    /// Valid language tags have at most one extended language.
125    ///
126    /// ```
127    /// use language_tags::LanguageTag;
128    ///
129    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
130    /// assert_eq!(language_tag.extended_language(), Some("cmn"));
131    /// ```
132    #[inline]
133    pub fn extended_language(&self) -> Option<&str> {
134        if self.language_end == self.extlang_end {
135            None
136        } else {
137            Some(&self.serialization[self.language_end + 1..self.extlang_end])
138        }
139    }
140
141    /// Iterate on the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2).
142    ///
143    /// Valid language tags have at most one extended language.
144    ///
145    /// ```
146    /// use language_tags::LanguageTag;
147    ///
148    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
149    /// assert_eq!(language_tag.extended_language_subtags().collect::<Vec<_>>(), vec!["cmn"]);
150    /// ```
151    #[inline]
152    pub fn extended_language_subtags(&self) -> impl Iterator<Item = &str> {
153        self.extended_language().unwrap_or("").split_terminator('-')
154    }
155
156    /// Return the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1)
157    /// and its [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2).
158    ///
159    /// ```
160    /// use language_tags::LanguageTag;
161    ///
162    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
163    /// assert_eq!(language_tag.full_language(), "zh-cmn");
164    /// ```
165    #[inline]
166    pub fn full_language(&self) -> &str {
167        &self.serialization[..self.extlang_end]
168    }
169
170    /// Return the [script subtag](https://tools.ietf.org/html/rfc5646#section-2.2.3).
171    ///
172    /// ```
173    /// use language_tags::LanguageTag;
174    ///
175    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
176    /// assert_eq!(language_tag.script(), Some("Hans"));
177    /// ```
178    #[inline]
179    pub fn script(&self) -> Option<&str> {
180        if self.extlang_end == self.script_end {
181            None
182        } else {
183            Some(&self.serialization[self.extlang_end + 1..self.script_end])
184        }
185    }
186
187    /// Return the [region subtag](https://tools.ietf.org/html/rfc5646#section-2.2.4).
188    ///
189    ///
190    /// ```
191    /// use language_tags::LanguageTag;
192    ///
193    /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap();
194    /// assert_eq!(language_tag.region(), Some("CN"));
195    /// ```
196    #[inline]
197    pub fn region(&self) -> Option<&str> {
198        if self.script_end == self.region_end {
199            None
200        } else {
201            Some(&self.serialization[self.script_end + 1..self.region_end])
202        }
203    }
204
205    /// Return the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5).
206    ///
207    /// ```
208    /// use language_tags::LanguageTag;
209    ///
210    /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap();
211    /// assert_eq!(language_tag.variant(), Some("pinyin"));
212    /// ```
213    #[inline]
214    pub fn variant(&self) -> Option<&str> {
215        if self.region_end == self.variant_end {
216            None
217        } else {
218            Some(&self.serialization[self.region_end + 1..self.variant_end])
219        }
220    }
221
222    /// Iterate on the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5).
223    ///
224    /// ```
225    /// use language_tags::LanguageTag;
226    ///
227    /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap();
228    /// assert_eq!(language_tag.variant_subtags().collect::<Vec<_>>(), vec!["pinyin"]);
229    /// ```
230    #[inline]
231    pub fn variant_subtags(&self) -> impl Iterator<Item = &str> {
232        self.variant().unwrap_or("").split_terminator('-')
233    }
234
235    /// Return the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6).
236    ///
237    /// ```
238    /// use language_tags::LanguageTag;
239    ///
240    /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap();
241    /// assert_eq!(language_tag.extension(), Some("u-co-phonebk"));
242    /// ```
243    #[inline]
244    pub fn extension(&self) -> Option<&str> {
245        if self.variant_end == self.extension_end {
246            None
247        } else {
248            Some(&self.serialization[self.variant_end + 1..self.extension_end])
249        }
250    }
251
252    /// Iterate on the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6).
253    ///
254    /// ```
255    /// use language_tags::LanguageTag;
256    ///
257    /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap();
258    /// assert_eq!(language_tag.extension_subtags().collect::<Vec<_>>(), vec![('u', "co-phonebk")]);
259    /// ```
260    #[inline]
261    pub fn extension_subtags(&self) -> impl Iterator<Item = (char, &str)> {
262        match self.extension() {
263            Some(parts) => ExtensionsIterator::new(parts),
264            None => ExtensionsIterator::new(""),
265        }
266    }
267
268    /// Return the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7).
269    ///
270    ///
271    /// ```
272    /// use language_tags::LanguageTag;
273    ///
274    /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap();
275    /// assert_eq!(language_tag.private_use(), Some("x-foo-bar"));
276    /// ```
277    #[inline]
278    pub fn private_use(&self) -> Option<&str> {
279        if self.serialization.starts_with("x-") {
280            Some(&self.serialization)
281        } else if self.extension_end == self.serialization.len() {
282            None
283        } else {
284            Some(&self.serialization[self.extension_end + 1..])
285        }
286    }
287
288    /// Iterate on the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7).
289    ///
290    /// ```
291    /// use language_tags::LanguageTag;
292    ///
293    /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap();
294    /// assert_eq!(language_tag.private_use_subtags().collect::<Vec<_>>(), vec!["foo", "bar"]);
295    /// ```
296    #[inline]
297    pub fn private_use_subtags(&self) -> impl Iterator<Item = &str> {
298        self.private_use()
299            .map(|part| &part[2..])
300            .unwrap_or("")
301            .split_terminator('-')
302    }
303
304    /// Create a `LanguageTag` from its serialization.
305    ///
306    /// This parser accepts the language tags that are "well-formed" according to
307    /// [RFC 5646](https://tools.ietf.org/html/rfc5646#section-2.2.9).
308    /// Full validation could be done with the `validate` method.
309    ///
310    /// ```rust
311    /// use language_tags::LanguageTag;
312    ///
313    /// let language_tag = LanguageTag::parse("en-us").unwrap();
314    /// assert_eq!(language_tag.into_string(), "en-US")
315    /// ```
316    ///
317    /// # Errors
318    ///
319    /// If the language tag is not "well-formed" a `ParseError` variant will be returned.
320    pub fn parse(input: &str) -> Result<Self, ParseError> {
321        //grandfathered tags
322        if let Some(tag) = GRANDFATHEREDS
323            .iter()
324            .find(|record| record.eq_ignore_ascii_case(input))
325        {
326            // grandfathered tag
327            Ok(tag_from_primary_language(*tag))
328        } else if input.starts_with("x-") || input.starts_with("X-") {
329            // private use
330            if !is_alphanumeric_or_dash(input) {
331                Err(ParseError::ForbiddenChar)
332            } else if input.len() == 2 {
333                Err(ParseError::EmptyPrivateUse)
334            } else {
335                Ok(tag_from_primary_language(input.to_ascii_lowercase()))
336            }
337        } else {
338            parse_language_tag(input)
339        }
340    }
341
342    /// Check if the language tag is "valid" according to
343    /// [RFC 5646](https://tools.ietf.org/html/rfc5646#section-2.2.9).
344    ///
345    /// It applies the following steps:
346    ///
347    /// * grandfathereds and private use tags are valid
348    /// * There should be no more than one extended language subtag
349    ///   (c.f. [errata 5457](https://www.rfc-editor.org/errata/eid5457)).
350    /// * Primary language, extended language, script, region and variants should appear
351    ///   in the IANA Language Subtag Registry.
352    /// * Extended language and variants should have a correct prefix as set
353    ///   in the IANA Language Subtag Registry.
354    /// * There should be no duplicate variant and singleton (extension) subtags.
355    ///
356    ///
357    /// # Errors
358    ///
359    /// If the language tag is not "valid" a `ValidationError` variant will be returned.
360    pub fn validate(&self) -> Result<(), ValidationError> {
361        // The tag is well-formed.
362        // always ok
363
364        // Private tag
365        if self.serialization.starts_with("x-") {
366            return Ok(());
367        }
368
369        // The tag is in the list of grandfathered tags
370        if is_in_str_slice_set(&GRANDFATHEREDS, &self.serialization) {
371            return Ok(());
372        }
373
374        // There is no more than one extended language subtag.
375        // From [errata 5457](https://www.rfc-editor.org/errata/eid5457).
376        if let Some(extended_language) = self.extended_language() {
377            if extended_language.contains('-') {
378                return Err(ValidationError::MultipleExtendedLanguageSubtags);
379            }
380        }
381
382        // all of its primary language, extended language, script, region, and variant
383        // subtags appear in the IANA Language Subtag Registry as of the
384        // particular registry date.
385        let primary_language = self.primary_language();
386        if !between(primary_language, "qaa", "qtz")
387            && !is_in_from_str_slice_set(&LANGUAGES, primary_language)
388        {
389            return Err(ValidationError::PrimaryLanguageNotInRegistry);
390        }
391        if let Some(extended_language) = self.extended_language() {
392            if let Some(extended_language_prefix) =
393                find_in_from_str_slice_map(&EXTLANGS, extended_language)
394            {
395                if !self.serialization.starts_with(extended_language_prefix) {
396                    return Err(ValidationError::WrongExtendedLanguagePrefix);
397                }
398            } else {
399                return Err(ValidationError::ExtendedLanguageNotInRegistry);
400            }
401        }
402        if let Some(script) = self.script() {
403            if !between(script, "Qaaa", "Qabx") && !is_in_from_str_slice_set(&SCRIPTS, script) {
404                return Err(ValidationError::ScriptNotInRegistry);
405            }
406        }
407        if let Some(region) = self.region() {
408            if !between(region, "QM", "QZ")
409                && !between(region, "XA", "XZ")
410                && !is_in_from_str_slice_set(&REGIONS, region)
411            {
412                return Err(ValidationError::RegionNotInRegistry);
413            }
414        }
415        for variant in self.variant_subtags() {
416            if let Some(variant_prefixes) = find_in_str_slice_map(&VARIANTS, variant) {
417                if !variant_prefixes
418                    .split(' ')
419                    .any(|prefix| self.serialization.starts_with(prefix))
420                {
421                    return Err(ValidationError::WrongVariantPrefix);
422                }
423            } else {
424                return Err(ValidationError::VariantNotInRegistry);
425            }
426        }
427
428        // There are no duplicate variant subtags.
429        let with_duplicate_variant = self.variant_subtags().enumerate().any(|(id1, variant1)| {
430            self.variant_subtags()
431                .enumerate()
432                .any(|(id2, variant2)| id1 != id2 && variant1 == variant2)
433        });
434        if with_duplicate_variant {
435            return Err(ValidationError::DuplicateVariant);
436        }
437
438        // There are no duplicate singleton (extension) subtags.
439        if let Some(extension) = self.extension() {
440            let mut seen_extensions = AlphanumericLowerCharSet::new();
441            let with_duplicate_extension = extension.split('-').any(|subtag| {
442                if subtag.len() == 1 {
443                    let extension = subtag.chars().next().unwrap();
444                    if seen_extensions.contains(extension) {
445                        true
446                    } else {
447                        seen_extensions.insert(extension);
448                        false
449                    }
450                } else {
451                    false
452                }
453            });
454            if with_duplicate_extension {
455                return Err(ValidationError::DuplicateExtension);
456            }
457        }
458
459        Ok(())
460    }
461
462    /// Check if the language tag is valid according to
463    /// [RFC 5646](https://tools.ietf.org/html/rfc5646#section-2.2.9).
464    pub fn is_valid(&self) -> bool {
465        self.validate().is_ok()
466    }
467
468    /// Returns the canonical version of the language tag following
469    /// [RFC 5646 4.5](https://tools.ietf.org/html/rfc5646#section-4.5).
470    ///
471    /// It currently applies the following steps:
472    ///
473    /// * Grandfathered tags are replaced with the canonical version if possible.
474    /// * Redundant tags are replaced with the canonical version if possible.
475    /// * Extension languages are promoted to primary language.
476    /// * Deprecated languages, scripts, regions and variants are replaced with modern equivalents.
477    /// * Suppress-Script is applied to remove default script for a language (e.g. "en-Latn" is canonicalized as "en").
478    /// * Variants are deduplicated
479    ///
480    ///
481    /// # Errors
482    ///
483    /// If there is not a unique way to canonicalize the language tag
484    /// a `ValidationError` variant will be returned.
485    pub fn canonicalize(&self) -> Result<LanguageTag, ValidationError> {
486        //We could not do anything for private use
487        if self.serialization.starts_with("x-") {
488            return Ok(self.clone());
489        }
490
491        // 2 Redundant or grandfathered tags are replaced by their 'Preferred-Value', if there is one.
492        if is_in_str_slice_set(&GRANDFATHEREDS, &self.serialization) {
493            return Ok(
494                if let Some(preferred_value) =
495                    find_in_str_slice_map(&GRANDFATHEREDS_PREFERRED_VALUE, &self.serialization)
496                {
497                    Self::parse(preferred_value).unwrap()
498                } else {
499                    self.clone()
500                },
501            );
502        }
503        if let Some(preferred_value) =
504            find_in_str_slice_map(&REDUNDANTS_PREFERRED_VALUE, &self.serialization)
505        {
506            return Ok(Self::parse(preferred_value).unwrap());
507        }
508        //TODO: what if a redundant has a some extensions/private use?
509
510        // 3.  Subtags are replaced by their 'Preferred-Value', if there is one.
511        // Primary language
512        let mut primary_language = self.primary_language();
513        if let Some(preferred_value) =
514            find_in_from_str_slice_map(&LANGUAGES_PREFERRED_VALUE, primary_language)
515        {
516            primary_language = preferred_value;
517        }
518
519        // Extended language
520        // For extlangs, the original primary language subtag is also replaced if there is a primary language subtag in the 'Preferred-Value'.
521        let mut extended_language = None;
522        if let Some(extlang) = self.extended_language() {
523            // We fail if there is more than one (no single possible canonicalization)
524            if extlang.contains('-') {
525                return Err(ValidationError::MultipleExtendedLanguageSubtags);
526            }
527            if let Some(preferred_value) =
528                find_in_from_str_slice_map(&EXTLANGS_PREFERRED_VALUE, extlang)
529            {
530                primary_language = preferred_value;
531            } else {
532                extended_language = Some(extlang);
533            }
534        }
535
536        let mut serialization = String::with_capacity(self.serialization.len());
537        serialization.push_str(primary_language);
538        let language_end = serialization.len();
539        if let Some(extended_language) = extended_language {
540            serialization.push('-');
541            serialization.push_str(extended_language);
542        }
543        let extlang_end = serialization.len();
544
545        // Script
546        if let Some(script) = self.script() {
547            let script =
548                find_in_from_str_slice_map(&SCRIPTS_PREFERRED_VALUE, script).unwrap_or(script);
549
550            // Suppress-Script
551            let match_suppress_script =
552                find_in_from_str_slice_map(&LANGUAGES_SUPPRESS_SCRIPT, primary_language)
553                    .filter(|suppress_script| *suppress_script == script)
554                    .is_some();
555            if !match_suppress_script {
556                serialization.push('-');
557                serialization.push_str(script);
558            }
559        }
560        let script_end = serialization.len();
561
562        // Region
563        if let Some(region) = self.region() {
564            serialization.push('-');
565            serialization.push_str(
566                find_in_from_str_slice_map(&REGIONS_PREFERRED_VALUE, region).unwrap_or(region),
567            );
568        }
569        let region_end = serialization.len();
570
571        // Variant
572        for variant in self.variant_subtags() {
573            let variant =
574                *find_in_str_slice_map(&VARIANTS_PREFERRED_VALUE, variant).unwrap_or(&variant);
575            let variant_already_exists = serialization.split('-').any(|subtag| subtag == variant);
576            if !variant_already_exists {
577                serialization.push('-');
578                serialization.push_str(variant);
579            }
580        }
581        let variant_end = serialization.len();
582
583        //Extension
584        // 1.  Extension sequences are ordered into case-insensitive ASCII order by singleton subtags
585        if self.extension().is_some() {
586            let mut extensions: Vec<_> = self.extension_subtags().collect();
587            extensions.sort_unstable();
588            for (k, v) in extensions {
589                serialization.push('-');
590                serialization.push(k);
591                serialization.push('-');
592                serialization.push_str(v);
593            }
594        }
595        let extension_end = serialization.len();
596
597        //Private use
598        if let Some(private_use) = self.private_use() {
599            serialization.push('-');
600            serialization.push_str(private_use);
601        }
602
603        Ok(LanguageTag {
604            serialization,
605            language_end,
606            extlang_end,
607            script_end,
608            region_end,
609            variant_end,
610            extension_end,
611        })
612    }
613
614    /// Matches language tags. The first language acts as a language range, the second one is used
615    /// as a normal language tag. None fields in the language range are ignored. If the language
616    /// tag has more extlangs than the range these extlangs are ignored. Matches are
617    /// case-insensitive.
618    ///
619    /// For example the range `en-GB` matches only `en-GB` and `en-Arab-GB` but not `en`.
620    /// The range `en` matches all language tags starting with `en` including `en`, `en-GB`,
621    /// `en-Arab` and `en-Arab-GB`.
622    ///
623    /// # Panics
624    /// If the language range has extensions or private use tags.
625    ///
626    /// # Examples
627    /// ```rust
628    /// use language_tags::LanguageTag;
629    ///
630    /// let range_italian = LanguageTag::parse("it").unwrap();
631    /// let tag_german = LanguageTag::parse("de").unwrap();
632    /// let tag_italian_switzerland = LanguageTag::parse("it-CH").unwrap();
633    /// assert!(!range_italian.matches(&tag_german));
634    /// assert!(range_italian.matches(&tag_italian_switzerland));
635    ///
636    /// let range_spanish_brazil = LanguageTag::parse("es-BR").unwrap();
637    /// let tag_spanish = LanguageTag::parse("es").unwrap();
638    /// assert!(!range_spanish_brazil.matches(&tag_spanish));
639    /// ```
640    pub fn matches(&self, other: &LanguageTag) -> bool {
641        fn matches_option(a: Option<&str>, b: Option<&str>) -> bool {
642            match (a, b) {
643                (Some(a), Some(b)) => a == b,
644                (None, _) => true,
645                (_, None) => false,
646            }
647        }
648        fn matches_iter<'a>(
649            a: impl Iterator<Item = &'a str>,
650            b: impl Iterator<Item = &'a str>,
651        ) -> bool {
652            a.zip(b).all(|(x, y)| x == y)
653        }
654        assert!(self.is_language_range());
655        self.full_language() == other.full_language()
656            && matches_option(self.script(), other.script())
657            && matches_option(self.region(), other.region())
658            && matches_iter(self.variant_subtags(), other.variant_subtags())
659    }
660
661    /// Checks if it is a language range, meaning that there are no extension and privateuse tags.
662    pub fn is_language_range(&self) -> bool {
663        self.extension().is_none() && self.private_use().is_none()
664    }
665}
666
667impl FromStr for LanguageTag {
668    type Err = ParseError;
669
670    #[inline]
671    fn from_str(input: &str) -> Result<Self, ParseError> {
672        Self::parse(input)
673    }
674}
675
676impl fmt::Display for LanguageTag {
677    #[inline]
678    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
679        f.write_str(self.as_str())
680    }
681}
682
683/// Builds a tag from its primary language
684fn tag_from_primary_language(tag: impl Into<String>) -> LanguageTag {
685    let serialization = tag.into();
686    let end = serialization.len();
687    LanguageTag {
688        serialization,
689        language_end: end,
690        extlang_end: end,
691        script_end: end,
692        region_end: end,
693        variant_end: end,
694        extension_end: end,
695    }
696}
697
698/// Handles normal tags.
699fn parse_language_tag(input: &str) -> Result<LanguageTag, ParseError> {
700    #[derive(PartialEq, Eq)]
701    enum State {
702        Start,
703        AfterLanguage,
704        AfterExtLang,
705        AfterScript,
706        AfterRegion,
707        InExtension { expected: bool },
708        InPrivateUse { expected: bool },
709    }
710
711    let mut serialization = String::with_capacity(input.len());
712
713    let mut state = State::Start;
714    let mut language_end = 0;
715    let mut extlang_end = 0;
716    let mut script_end = 0;
717    let mut region_end = 0;
718    let mut variant_end = 0;
719    let mut extension_end = 0;
720    let mut extlangs_count = 0;
721    for (subtag, end) in SubTagIterator::new(input) {
722        if subtag.is_empty() {
723            // All subtags have a maximum length of eight characters.
724            return Err(ParseError::EmptySubtag);
725        }
726        if subtag.len() > 8 {
727            // All subtags have a maximum length of eight characters.
728            return Err(ParseError::SubtagTooLong);
729        }
730        if state == State::Start {
731            // Primary language
732            if subtag.len() < 2 || !is_alphabetic(subtag) {
733                return Err(ParseError::InvalidLanguage);
734            }
735            language_end = end;
736            serialization.extend(to_lowercase(subtag));
737            if subtag.len() < 4 {
738                // extlangs are only allowed for short language tags
739                state = State::AfterLanguage;
740            } else {
741                state = State::AfterExtLang;
742            }
743        } else if let State::InPrivateUse { .. } = state {
744            if !is_alphanumeric(subtag) {
745                return Err(ParseError::InvalidSubtag);
746            }
747            serialization.push('-');
748            serialization.extend(to_lowercase(subtag));
749            state = State::InPrivateUse { expected: false };
750        } else if subtag == "x" || subtag == "X" {
751            // We make sure extension is found
752            if let State::InExtension { expected: true } = state {
753                return Err(ParseError::EmptyExtension);
754            }
755            serialization.push('-');
756            serialization.push('x');
757            state = State::InPrivateUse { expected: true };
758        } else if subtag.len() == 1 && is_alphanumeric(subtag) {
759            // We make sure extension is found
760            if let State::InExtension { expected: true } = state {
761                return Err(ParseError::EmptyExtension);
762            }
763            let extension_tag = subtag.chars().next().unwrap().to_ascii_lowercase();
764            serialization.push('-');
765            serialization.push(extension_tag);
766            state = State::InExtension { expected: true };
767        } else if let State::InExtension { .. } = state {
768            if !is_alphanumeric(subtag) {
769                return Err(ParseError::InvalidSubtag);
770            }
771            extension_end = end;
772            serialization.push('-');
773            serialization.extend(to_lowercase(subtag));
774            state = State::InExtension { expected: false };
775        } else if state == State::AfterLanguage && subtag.len() == 3 && is_alphabetic(subtag) {
776            extlangs_count += 1;
777            if extlangs_count > 3 {
778                return Err(ParseError::TooManyExtlangs);
779            }
780            // valid extlangs
781            extlang_end = end;
782            serialization.push('-');
783            serialization.extend(to_lowercase(subtag));
784        } else if (state == State::AfterLanguage || state == State::AfterExtLang)
785            && subtag.len() == 4
786            && is_alphabetic(subtag)
787        {
788            // Script
789            script_end = end;
790            serialization.push('-');
791            serialization.extend(to_uppercase_first(subtag));
792            state = State::AfterScript;
793        } else if (state == State::AfterLanguage
794            || state == State::AfterExtLang
795            || state == State::AfterScript)
796            && (subtag.len() == 2 && is_alphabetic(subtag)
797                || subtag.len() == 3 && is_numeric(subtag))
798        {
799            // Region
800            region_end = end;
801            serialization.push('-');
802            serialization.extend(to_uppercase(subtag));
803            state = State::AfterRegion;
804        } else if (state == State::AfterLanguage
805            || state == State::AfterExtLang
806            || state == State::AfterScript
807            || state == State::AfterRegion)
808            && is_alphanumeric(subtag)
809            && (subtag.len() >= 5 && is_alphabetic(&subtag[0..1])
810                || subtag.len() >= 4 && is_numeric(&subtag[0..1]))
811        {
812            // Variant
813            variant_end = end;
814            serialization.push('-');
815            serialization.extend(to_lowercase(subtag));
816            state = State::AfterRegion;
817        } else {
818            return Err(ParseError::InvalidSubtag);
819        }
820    }
821
822    //We make sure we are in a correct final state
823    if let State::InExtension { expected: true } = state {
824        return Err(ParseError::EmptyExtension);
825    }
826    if let State::InPrivateUse { expected: true } = state {
827        return Err(ParseError::EmptyPrivateUse);
828    }
829
830    //We make sure we have not skipped anyone
831    if extlang_end < language_end {
832        extlang_end = language_end;
833    }
834    if script_end < extlang_end {
835        script_end = extlang_end;
836    }
837    if region_end < script_end {
838        region_end = script_end;
839    }
840    if variant_end < region_end {
841        variant_end = region_end;
842    }
843    if extension_end < variant_end {
844        extension_end = variant_end;
845    }
846
847    Ok(LanguageTag {
848        serialization,
849        language_end,
850        extlang_end,
851        script_end,
852        region_end,
853        variant_end,
854        extension_end,
855    })
856}
857
/// Iterator over the extensions found in a dash-separated string.
///
/// Each item is a `(singleton, content)` pair: the first character of the
/// leading subtag and the dash-separated subtags that follow it, up to (but
/// not including) the next one-character subtag.
struct ExtensionsIterator<'a> {
    /// Part of the input that has not been consumed yet.
    input: &'a str,
}

impl<'a> ExtensionsIterator<'a> {
    /// Creates an iterator reading extensions from `input`.
    fn new(input: &'a str) -> Self {
        ExtensionsIterator { input }
    }
}

impl<'a> Iterator for ExtensionsIterator<'a> {
    type Item = (char, &'a str);

    /// Returns the next `(singleton, content)` pair, or `None` once the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let remaining = self.input;
        let mut subtags = remaining.split_terminator('-');
        let singleton = subtags.next()?.chars().next().unwrap();

        // Byte offset just past the subtags scanned so far, each counted with its
        // trailing dash. It starts at 2 to account for the singleton and its dash.
        let mut offset: usize = 2;
        for subtag in subtags {
            if subtag.len() == 1 {
                // A one-character subtag starts the next extension: the content stops
                // before the dash that precedes it.
                let content = &remaining[2..offset - 1];
                self.input = &remaining[offset..];
                return Some((singleton, content));
            }
            offset += subtag.len() + 1;
        }

        // No further singleton: everything after the first dash is the content.
        self.input = "";
        remaining.get(2..).map(|content| (singleton, content))
    }
}
889
/// Iterator over the dash-separated subtags of a string.
///
/// Each item is the subtag together with the byte offset, in the original
/// input, of the position right after that subtag.
struct SubTagIterator<'a> {
    /// Underlying split of the input on `'-'`.
    split: Split<'a, char>,
    /// Byte offset in the input at which the next subtag starts.
    position: usize,
}

impl<'a> SubTagIterator<'a> {
    /// Creates an iterator over the subtags of `input`, starting at offset 0.
    fn new(input: &'a str) -> Self {
        SubTagIterator {
            position: 0,
            split: input.split('-'),
        }
    }
}

impl<'a> Iterator for SubTagIterator<'a> {
    type Item = (&'a str, usize);

    /// Returns the next subtag and its end offset, or `None` when the split is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let subtag = self.split.next()?;
        let end = self.position + subtag.len();
        // Skip the dash separating this subtag from the next one.
        self.position = end + 1;
        Some((subtag, end))
    }
}
914
/// Set of ASCII alphanumeric characters in which letters are case-insensitive.
///
/// An upper-case letter and its lower-case counterpart share the same slot, so
/// inserting one makes the other a member as well.
struct AlphanumericLowerCharSet {
    /// Membership flags for letters, indexed by their distance from `'a'` (or `'A'`).
    alphabetic_set: [bool; 26],
    /// Membership flags for digits, indexed by their distance from `'0'`.
    numeric_set: [bool; 10],
}

impl AlphanumericLowerCharSet {
    /// Creates an empty set.
    fn new() -> Self {
        Self {
            alphabetic_set: [false; 26],
            numeric_set: [false; 10],
        }
    }

    /// Returns `true` if `c` (ignoring ASCII case) has been inserted.
    ///
    /// Characters that are not ASCII letters or digits are never members.
    /// This is a read-only query, so it only needs a shared borrow.
    fn contains(&self, c: char) -> bool {
        match c {
            '0'..='9' => self.numeric_set[char_sub(c, '0')],
            'a'..='z' => self.alphabetic_set[char_sub(c, 'a')],
            'A'..='Z' => self.alphabetic_set[char_sub(c, 'A')],
            _ => false,
        }
    }

    /// Adds `c` to the set, folding upper-case letters onto lower-case ones.
    ///
    /// Characters that are not ASCII letters or digits are silently ignored.
    fn insert(&mut self, c: char) {
        match c {
            '0'..='9' => self.numeric_set[char_sub(c, '0')] = true,
            'a'..='z' => self.alphabetic_set[char_sub(c, 'a')] = true,
            'A'..='Z' => self.alphabetic_set[char_sub(c, 'A')] = true,
            _ => {}
        }
    }
}

/// Returns the distance between the code points of `c1` and `c2`.
///
/// The subtraction is unsigned, so `c1` must not be smaller than `c2`.
fn char_sub(c1: char, c2: char) -> usize {
    (c1 as usize) - (c2 as usize)
}
954
/// Returns `true` if every character of `s` is an ASCII letter (vacuously true for `""`).
fn is_alphabetic(s: &str) -> bool {
    // Any non-ASCII character is encoded with bytes >= 0x80, none of which is an ASCII letter.
    s.bytes().all(|byte| byte.is_ascii_alphabetic())
}
958
/// Returns `true` if every character of `s` is an ASCII digit (vacuously true for `""`).
fn is_numeric(s: &str) -> bool {
    // Any non-ASCII character is encoded with bytes >= 0x80, none of which is an ASCII digit.
    s.bytes().all(|byte| byte.is_ascii_digit())
}
962
/// Returns `true` if every character of `s` is an ASCII letter or digit (vacuously true for `""`).
fn is_alphanumeric(s: &str) -> bool {
    // Any non-ASCII character is encoded with bytes >= 0x80, none of which is ASCII alphanumeric.
    s.bytes().all(|byte| byte.is_ascii_alphanumeric())
}
966
/// Returns `true` if `s` consists only of ASCII letters, ASCII digits and `-`
/// (vacuously true for `""`).
fn is_alphanumeric_or_dash(s: &str) -> bool {
    s.bytes().all(|byte| match byte {
        b'-' => true,
        other => other.is_ascii_alphanumeric(),
    })
}
970
/// Lazily yields the characters of `s` with ASCII letters converted to upper case.
///
/// Non-ASCII characters are yielded unchanged.
fn to_uppercase(s: &'_ str) -> impl Iterator<Item = char> + '_ {
    let upper = |c: char| c.to_ascii_uppercase();
    s.chars().map(upper)
}
974
/// Lazily yields the characters of `s` with the first one in ASCII upper case and all
/// the following ones in ASCII lower case.
///
/// # Panics
///
/// Panics if `s` is empty (should never happen in our code).
fn to_uppercase_first(s: &'_ str) -> impl Iterator<Item = char> + '_ {
    let mut rest = s.chars();
    let head = rest.next().unwrap().to_ascii_uppercase();
    let tail = rest.map(|c| c.to_ascii_lowercase());
    once(head).chain(tail)
}
980
/// Lazily yields the characters of `s` with ASCII letters converted to lower case.
///
/// Non-ASCII characters are yielded unchanged.
fn to_lowercase(s: &'_ str) -> impl Iterator<Item = char> + '_ {
    let lower = |c: char| c.to_ascii_lowercase();
    s.chars().map(lower)
}
984
/// Errors that can be reported while parsing a `LanguageTag`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
    /// An extension subtag is present but has no content.
    EmptyExtension,
    /// The `x` (private use) subtag is present but has no content.
    EmptyPrivateUse,
    /// The langtag contains a character other than A-Z, a-z, 0-9 or the dash.
    ForbiddenChar,
    /// A subtag could not be parsed: it does not match any kind of subtag.
    InvalidSubtag,
    /// The language subtag is invalid.
    InvalidLanguage,
    /// A subtag is longer than the maximum of eight characters.
    SubtagTooLong,
    /// A subtag is empty.
    EmptySubtag,
    /// More than three extlangs are present (zero to one extlangs are preferred).
    TooManyExtlangs,
}

impl Error for ParseError {}

impl fmt::Display for ParseError {
    /// Writes the lower-case, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match *self {
            ParseError::EmptyExtension => {
                "if an extension subtag is present, it must not be empty"
            }
            ParseError::EmptyPrivateUse => "if the `x` subtag is present, it must not be empty",
            ParseError::ForbiddenChar => "the langtag contains a char not allowed",
            ParseError::InvalidSubtag => {
                "a subtag fails to parse, it does not match any other subtags"
            }
            ParseError::InvalidLanguage => "the given language subtag is invalid",
            ParseError::SubtagTooLong => "a subtag may be eight characters in length at maximum",
            ParseError::EmptySubtag => "a subtag should not be empty",
            ParseError::TooManyExtlangs => "at maximum three extlangs are allowed",
        };
        f.write_str(message)
    }
}
1022
/// Errors that can be reported while validating a `LanguageTag`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationError {
    /// The same variant subtag appears more than once in the tag.
    DuplicateVariant,
    /// The same extension subtag appears more than once before the private use part.
    DuplicateExtension,
    /// More than one extended language subtag is present; only one is allowed.
    MultipleExtendedLanguageSubtags,
    /// The primary language is not in the IANA Language Subtag Registry.
    PrimaryLanguageNotInRegistry,
    /// The extended language is not in the IANA Language Subtag Registry.
    ExtendedLanguageNotInRegistry,
    /// The script is not in the IANA Language Subtag Registry.
    ScriptNotInRegistry,
    /// The region is not in the IANA Language Subtag Registry.
    RegionNotInRegistry,
    /// A variant is not in the IANA Language Subtag Registry.
    VariantNotInRegistry,
    /// The primary language is not the prefix expected for the extended language by the
    /// IANA Language Subtag Registry.
    WrongExtendedLanguagePrefix,
    /// The language tag does not start with one of the prefixes expected for a variant by
    /// the IANA Language Subtag Registry.
    WrongVariantPrefix,
}

impl Error for ValidationError {}

impl fmt::Display for ValidationError {
    /// Writes the lower-case, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match *self {
            ValidationError::DuplicateVariant => {
                "the same variant subtag is only allowed once in a tag"
            }
            ValidationError::DuplicateExtension => {
                "the same extension subtag is only allowed once in a tag"
            }
            ValidationError::MultipleExtendedLanguageSubtags => {
                "only one extended language subtag is allowed"
            }
            ValidationError::PrimaryLanguageNotInRegistry => {
                "the primary language is not in the IANA Language Subtag Registry"
            }
            ValidationError::ExtendedLanguageNotInRegistry => {
                "the extended language is not in the IANA Language Subtag Registry"
            }
            ValidationError::ScriptNotInRegistry => {
                "the script is not in the IANA Language Subtag Registry"
            }
            ValidationError::RegionNotInRegistry => {
                "the region is not in the IANA Language Subtag Registry"
            }
            ValidationError::VariantNotInRegistry => {
                "a variant is not in the IANA Language Subtag Registry"
            }
            ValidationError::WrongExtendedLanguagePrefix => {
                "the primary language is not the expected extended language prefix from the IANA Language Subtag Registry"
            }
            ValidationError::WrongVariantPrefix => {
                "the language tag has not one of the expected variant prefix from the IANA Language Subtag Registry"
            }
        };
        f.write_str(message)
    }
}
1086
/// Returns `true` if `value` lies in the closed interval from `start` to `end`
/// (both bounds included).
fn between<T: Ord>(value: T, start: T, end: T) -> bool {
    (start..=end).contains(&value)
}
1090
/// Returns `true` if `value` is one of the strings in `slice`.
///
/// The lookup is a binary search, so `slice` has to be sorted in ascending order.
fn is_in_str_slice_set(slice: &[&'static str], value: &str) -> bool {
    slice
        .binary_search_by(|candidate| (**candidate).cmp(value))
        .is_ok()
}
1094
/// Returns `true` if `value` parses into a `T` that is present in `slice`.
///
/// A `value` that fails to parse is reported as absent. The lookup is a binary search,
/// so `slice` has to be sorted in ascending order.
fn is_in_from_str_slice_set<T: Copy + Ord + FromStr>(slice: &[T], value: &str) -> bool {
    value
        .parse::<T>()
        .ok()
        .map_or(false, |key| slice.binary_search(&key).is_ok())
}
1101
/// Looks up `value` among the string keys of `slice` and returns the associated value.
///
/// Returns `None` when no entry has `value` as its key. The lookup is a binary search,
/// so `slice` has to be sorted by key in ascending order.
fn find_in_str_slice_map<'a, V>(slice: &'a [(&'static str, V)], value: &str) -> Option<&'a V> {
    slice
        .binary_search_by(|entry| entry.0.cmp(value))
        .ok()
        .map(|index| &slice[index].1)
}
1109
/// Parses `value` into a key of type `K`, looks it up in `slice` and returns the
/// associated value as a string slice.
///
/// Returns `None` when `value` fails to parse or when no entry has the parsed key.
/// The lookup is a binary search, so `slice` has to be sorted by key in ascending order.
fn find_in_from_str_slice_map<'a, K: Copy + Ord + FromStr, V: Deref<Target = str>>(
    slice: &'a [(K, V)],
    value: &str,
) -> Option<&'a str> {
    let key = value.parse::<K>().ok()?;
    let index = slice.binary_search_by(|entry| entry.0.cmp(&key)).ok()?;
    Some(&*slice[index].1)
}
language_tags/lib.rs

language_tags/
lib.rs