Skip to content

Sendable Config #189

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ DerivedData/
.swiftpm/config/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
.idea
.idea
.index-build
*.out
5 changes: 3 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ let package = Package(
],
dependencies: [
.package(url: "https://github.com/apple/swift-argument-parser.git", .upToNextMinor(from: "1.4.0")),
.package(url: "https://github.com/apple/swift-collections.git", .upToNextMinor(from: "1.1.4")),
.package(url: "https://github.com/johnmai-dev/Jinja", .upToNextMinor(from: "1.1.0")),
],
targets: [
Expand All @@ -24,13 +25,13 @@ let package = Package(
]
),
.executableTarget(name: "HubCLI", dependencies: ["Hub", .product(name: "ArgumentParser", package: "swift-argument-parser")]),
.target(name: "Hub", resources: [.process("FallbackConfigs")]),
.target(name: "Hub", dependencies: [.product(name: "OrderedCollections", package: "swift-collections")], resources: [.process("FallbackConfigs")]),
.target(name: "Tokenizers", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")]),
.target(name: "TensorUtils"),
.target(name: "Generation", dependencies: ["Tokenizers", "TensorUtils"]),
.target(name: "Models", dependencies: ["Tokenizers", "Generation", "TensorUtils"]),
.testTarget(name: "TokenizersTests", dependencies: ["Tokenizers", "Models", "Hub"], resources: [.process("Resources"), .process("Vocabs")]),
.testTarget(name: "HubTests", dependencies: ["Hub"]),
.testTarget(name: "HubTests", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")]),
.testTarget(name: "PreTokenizerTests", dependencies: ["Tokenizers", "Hub"]),
.testTarget(name: "TensorUtilsTests", dependencies: ["TensorUtils", "Models", "Hub"], resources: [.process("Resources")]),
.testTarget(name: "NormalizerTests", dependencies: ["Tokenizers", "Hub"]),
Expand Down
242 changes: 242 additions & 0 deletions Sources/Hub/BinaryDistinct.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
//
// BinaryDistinct.swift
// swift-transformers
//
// Created by Piotr Kowalczuk on 06.03.25.
//

import Foundation

/// A string stored as raw UTF-16 code units so that equality, hashing and ordering are binary-exact.
///
/// `BinaryDistinctString` works around limitations of both `String` and `NSString`: the former
/// compares using Unicode canonical equivalence, and the latter is not `Sendable`. For more reference see
/// [Modifying-and-Comparing-Strings](https://developer.apple.com/documentation/swift/string#Modifying-and-Comparing-Strings).
public struct BinaryDistinctString: Equatable, Hashable, Sendable, Comparable, CustomStringConvertible, ExpressibleByStringLiteral {
    /// The UTF-16 code units backing this string.
    public let value: [UInt16]

    /// The content as an `NSString` built from the stored code units.
    public var nsString: NSString {
        String(utf16CodeUnits: value, count: value.count) as NSString
    }

    /// The content as a Swift `String`.
    public var string: String {
        String(nsString)
    }

    /// The number of `Character`s in `string` (not the number of UTF-16 code units).
    public var count: Int {
        string.count
    }

    /// Satisfies ``CustomStringConvertible`` protocol.
    public var description: String {
        string
    }

    /// Creates a string directly from UTF-16 code units.
    public init(_ bytes: [UInt16]) {
        value = bytes
    }

    /// Creates a string from the UTF-16 code units of `str`.
    public init(_ str: NSString) {
        // Read the UTF-16 view directly instead of splitting into Characters and re-joining.
        value = Array((str as String).utf16)
    }

    /// Creates a string from the UTF-16 code units of `str`.
    public init(_ str: String) {
        // No need to bridge through NSString just to read the code units.
        value = Array(str.utf16)
    }

    /// Creates a string holding a single character.
    public init(_ character: BinaryDistinctCharacter) {
        value = character.bytes
    }

    /// Creates a string by concatenating the code units of `characters` in order.
    public init(_ characters: [BinaryDistinctCharacter]) {
        value = characters.flatMap(\.bytes)
    }

    /// Satisfies ``ExpressibleByStringLiteral`` protocol.
    public init(stringLiteral value: String) {
        self.init(value)
    }

    /// Two strings are equal only when their code units are identical.
    public static func == (lhs: BinaryDistinctString, rhs: BinaryDistinctString) -> Bool {
        lhs.value == rhs.value
    }

    /// Orders strings lexicographically by code unit.
    public static func < (lhs: BinaryDistinctString, rhs: BinaryDistinctString) -> Bool {
        lhs.value.lexicographicallyPrecedes(rhs.value)
    }

    /// Concatenates the code units of both operands.
    public static func + (lhs: BinaryDistinctString, rhs: BinaryDistinctString) -> BinaryDistinctString {
        BinaryDistinctString(lhs.value + rhs.value)
    }

    /// Returns `true` when the code units begin with those of `prefix`.
    public func hasPrefix(_ prefix: BinaryDistinctString) -> Bool {
        // `starts(with:)` already yields false when `prefix` is longer than `value`.
        value.starts(with: prefix.value)
    }

    /// Returns `true` when the code units end with those of `suffix`.
    public func hasSuffix(_ suffix: BinaryDistinctString) -> Bool {
        let tailLength = suffix.value.count
        guard tailLength <= value.count else { return false }
        return value.suffix(tailLength).elementsEqual(suffix.value)
    }

    /// Returns a lowercased copy, computed via `String.lowercased()`.
    public func lowercased() -> BinaryDistinctString {
        .init(string.lowercased())
    }

    /// Returns a copy in which every occurrence of `of` is replaced by `with`.
    ///
    /// The search is literal (code unit by code unit). Foundation's default search treats
    /// canonically equivalent sequences as matches, which would contradict the binary-exact
    /// comparison this type exists to provide.
    public func replacingOccurrences(of: Self, with: Self) -> BinaryDistinctString {
        BinaryDistinctString(string.replacingOccurrences(of: of.string, with: with.string, options: .literal))
    }
}

public extension BinaryDistinctString {
    /// Indices are plain integer offsets in the range `0...count`.
    typealias Index = Int

    /// The position of the first element; always `0`.
    var startIndex: Index { 0 }
    /// The position one past the last element; equal to `count`.
    var endIndex: Index { count }

    /// Returns `i` moved by `distance`.
    ///
    /// Traps with "Index out of bounds" when the result falls outside `0...count`.
    func index(_ i: Index, offsetBy distance: Int) -> Index {
        let newIndex = i + distance
        guard newIndex >= 0, newIndex <= count else {
            fatalError("Index out of bounds")
        }
        return newIndex
    }

    /// Returns `i` moved by `distance`, or `nil` when that would step past `limit`.
    ///
    /// For a non-negative `distance`, `limit` acts as an upper bound. For a negative `distance`
    /// it acts as a lower bound; previously a backwards move could pass below `limit`
    /// and still return an index.
    func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? {
        let newIndex = i + distance
        if distance < 0 {
            return newIndex >= limit ? newIndex : nil
        }
        return newIndex <= limit ? newIndex : nil
    }
}

extension BinaryDistinctString: Sequence {
    /// Iterates the string one `Character` at a time, wrapping each in a `BinaryDistinctCharacter`.
    public func makeIterator() -> AnyIterator<BinaryDistinctCharacter> {
        // Walk the native String so elements fall on Character boundaries.
        var characters = string.makeIterator()
        return AnyIterator {
            characters.next().map { BinaryDistinctCharacter($0) }
        }
    }
}

public extension BinaryDistinctString {
    /// Returns the slice starting at character offset `bounds.lowerBound` and running to the end.
    subscript(bounds: PartialRangeFrom<Int>) -> BinaryDistinctString {
        // The upper bound must be in the same unit as the Range subscript below (Characters),
        // not the number of UTF-16 code units in `value`.
        self[bounds.lowerBound..<count]
    }

    /// Returns the slice covering the characters at offsets `bounds`.
    ///
    /// Offsets count `Character`s (grapheme clusters), matching `count`, `endIndex` and the
    /// elements produced by iteration. The returned string contains exactly the UTF-16 code
    /// units of the selected characters.
    ///
    /// Traps with "Index out of bounds" when `bounds` is not within `0...count`.
    subscript(bounds: Range<Int>) -> BinaryDistinctString {
        let text = string
        guard bounds.lowerBound >= 0,
              let lower = text.index(text.startIndex, offsetBy: bounds.lowerBound, limitedBy: text.endIndex),
              let upper = text.index(lower, offsetBy: bounds.count, limitedBy: text.endIndex)
        else {
            fatalError("Index out of bounds")
        }

        // `value` holds UTF-16 code units, so character positions must be translated into
        // UTF-16 offsets (not UTF-8 byte counts) before slicing the backing array.
        let startUnit = text.utf16.distance(from: text.startIndex, to: lower)
        let endUnit = text.utf16.distance(from: text.startIndex, to: upper)

        return BinaryDistinctString(Array(value[startUnit..<endUnit]))
    }
}

public extension Dictionary where Key == BinaryDistinctString {
    /// Folds `other` into this dictionary in place.
    ///
    /// `strategy` receives the existing and the incoming value for a clashing key and returns
    /// the one to keep; by default the incoming value wins.
    mutating func merge(_ other: [BinaryDistinctString: Value], strategy: (Value, Value) -> Value = { _, new in new }) {
        merge(other, uniquingKeysWith: strategy)
    }

    /// Folds a `String`-keyed dictionary into this one after re-keying it with `BinaryDistinctString`.
    mutating func merge(_ other: [String: Value], strategy: (Value, Value) -> Value = { _, new in new }) {
        let rekeyed = other.map { entry in (BinaryDistinctString(entry.key), entry.value) }
        merge([BinaryDistinctString: Value](uniqueKeysWithValues: rekeyed), uniquingKeysWith: strategy)
    }

    /// Folds an `NSString`-keyed dictionary into this one after re-keying it with `BinaryDistinctString`.
    mutating func merge(_ other: [NSString: Value], strategy: (Value, Value) -> Value = { _, new in new }) {
        let rekeyed = other.map { entry in (BinaryDistinctString(entry.key), entry.value) }
        merge([BinaryDistinctString: Value](uniqueKeysWithValues: rekeyed), uniquingKeysWith: strategy)
    }

    /// Non-mutating counterpart of `merge(_:strategy:)` for `String` keys.
    func merging(_ other: [String: Value], strategy: (Value, Value) -> Value = { _, new in new }) -> Self {
        var combined = self
        combined.merge(other, strategy: strategy)
        return combined
    }

    /// Non-mutating counterpart of `merge(_:strategy:)` for `BinaryDistinctString` keys.
    func merging(_ other: [BinaryDistinctString: Value], strategy: (Value, Value) -> Value = { _, new in new }) -> Self {
        var combined = self
        combined.merge(other, strategy: strategy)
        return combined
    }

    /// Non-mutating counterpart of `merge(_:strategy:)` for `NSString` keys.
    func merging(_ other: [NSString: Value], strategy: (Value, Value) -> Value = { _, new in new }) -> Self {
        var combined = self
        combined.merge(other, strategy: strategy)
        return combined
    }
}

/// Marker protocol for string-like types that can be created from a string literal.
///
/// It refines `ExpressibleByStringLiteral` and adds no requirements of its own.
public protocol StringConvertible: ExpressibleByStringLiteral { }

// The string-like types that adopt `StringConvertible`.
extension BinaryDistinctString: StringConvertible { }
extension String: StringConvertible { }
extension NSString: StringConvertible { }

/// A single character stored as raw UTF-16 code units, so equality and hashing are binary-exact.
public struct BinaryDistinctCharacter: Equatable, Hashable, CustomStringConvertible, ExpressibleByStringLiteral {
    /// The UTF-16 code units that make up this character.
    let bytes: [UInt16]

    /// Creates a character from the UTF-16 code units of `character`.
    public init(_ character: Character) {
        bytes = Array(character.utf16)
    }

    /// Creates a character from the UTF-16 code units of `string`.
    public init(_ string: String) {
        bytes = Array(string.utf16)
    }

    /// Creates a character from the UTF-16 code units of `nsString`.
    public init(_ nsString: NSString) {
        bytes = Array((nsString as String).utf16)
    }

    /// Creates a character directly from UTF-16 code units.
    public init(bytes: [UInt16]) {
        self.bytes = bytes
    }

    /// Satisfies ``ExpressibleByStringLiteral`` protocol.
    public init(stringLiteral value: String) {
        self.init(value)
    }

    /// The decoded text, or `nil` when `bytes` is not well-formed UTF-16
    /// (for example, an unpaired surrogate).
    ///
    /// The previous implementation could never return `nil`, which left the
    /// invalid-input branch of `description` unreachable.
    var stringValue: String? {
        var decoder = UTF16()
        var units = bytes.makeIterator()
        var scalars = String.UnicodeScalarView()
        while true {
            switch decoder.decode(&units) {
            case .scalarValue(let scalar):
                scalars.append(scalar)
            case .emptyInput:
                return String(scalars)
            case .error:
                return nil
            }
        }
    }

    /// A debug-style description showing the decoded text (when valid) and the raw code units in hex.
    public var description: String {
        let hexUnits = bytes.map { String(format: "0x%02X", $0) }
        if let str = stringValue {
            return "BinaryDistinctCharacter('\(str)', bytes: \(hexUnits))"
        }
        // The storage is UTF-16, so report the encoding accurately.
        return "BinaryDistinctCharacter(invalid UTF-16, bytes: \(hexUnits))"
    }

    /// Two characters are equal only when their code units are identical.
    public static func == (lhs: BinaryDistinctCharacter, rhs: BinaryDistinctCharacter) -> Bool {
        lhs.bytes == rhs.bytes
    }
}
Loading
Loading