{"id":9812,"date":"2024-12-17T18:38:58","date_gmt":"2024-12-17T10:38:58","guid":{"rendered":"http:\/\/www.yliyun.com\/?p=9812"},"modified":"2024-12-17T18:38:58","modified_gmt":"2024-12-17T10:38:58","slug":"python-%e8%af%ad%e8%a8%80%e6%a3%80%e6%b5%8b","status":"publish","type":"post","link":"http:\/\/www.yliyun.com\/2024\/12\/17\/python-%e8%af%ad%e8%a8%80%e6%a3%80%e6%b5%8b\/","title":{"rendered":"Python \u8bed\u8a00\u68c0\u6d4b"},"content":{"rendered":"\n

Python \u4e2d\u6709\u591a\u79cd\u4f18\u79c0\u7684\u8bed\u8a00\u8bc6\u522b\u5de5\u5177\uff0c\u4ee5\u4e0b\u662f\u4e00\u4e9b\u5e38\u7528\u7684\u5de5\u5177\u548c\u5e93\uff1a<\/p>\n\n\n\n

1. langdetect<\/strong><\/p>\n\n\n\n

\u2022 \u7b80\u4ecb<\/strong>: langdetect \u662f\u4e00\u4e2a\u975e\u5e38\u6d41\u884c\u7684\u8bed\u8a00\u68c0\u6d4b\u5e93\uff0c\u57fa\u4e8e Google \u7684 language-detection \u9879\u76ee\u3002\u5b83\u53ef\u4ee5\u68c0\u6d4b\u591a\u79cd\u8bed\u8a00\uff0c\u5e76\u4e14\u5bf9\u4e8e\u77ed\u6587\u672c\u4e5f\u6709\u4e0d\u9519\u7684\u8bc6\u522b\u6548\u679c\u3002<\/p>\n\n\n\n

\u2022 \u5b89\u88c5<\/strong>:<\/p>\n\n\n\n

pip install langdetect<\/p>\n\n\n\n

\u2022 \u4f7f\u7528\u793a\u4f8b<\/strong>:<\/p>\n\n\n\n

from langdetect import detect<\/p>\n\n\n\n

text = \"Bonjour tout le monde\"<\/p>\n\n\n\n

language = detect(text)<\/p>\n\n\n\n

print(language)  # \u8f93\u51fa: 'fr' (\u6cd5\u8bed)<\/em><\/p>\n\n\n\n

2. langid<\/strong><\/p>\n\n\n\n

\u2022 \u7b80\u4ecb<\/strong>: langid \u662f\u53e6\u4e00\u4e2a\u975e\u5e38\u5f3a\u5927\u7684\u8bed\u8a00\u8bc6\u522b\u5e93\uff0c\u652f\u630197\u79cd\u8bed\u8a00\u3002\u5b83\u7684\u7279\u70b9\u662f\u5b8c\u5168\u81ea\u5305\u542b\u4e14\u65e0\u9700\u5916\u90e8\u4f9d\u8d56\u3002<\/p>\n\n\n\n

\u2022 \u5b89\u88c5<\/strong>:<\/p>\n\n\n\n

pip install langid<\/p>\n\n\n\n

\u2022 \u4f7f\u7528\u793a\u4f8b<\/strong>:<\/p>\n\n\n\n

import langid<\/p>\n\n\n\n

text = \"Hola, \u00bfc\u00f3mo est\u00e1s?\"<\/p>\n\n\n\n

language, _ = langid.classify(text)<\/p>\n\n\n\n

print(language)  # \u8f93\u51fa: 'es' (\u897f\u73ed\u7259\u8bed)<\/em><\/p>\n\n\n\n

3. polyglot<\/strong><\/p>\n\n\n\n

\u2022 \u7b80\u4ecb<\/strong>: polyglot \u662f\u4e00\u4e2a\u652f\u6301\u591a\u8bed\u8a00\u5904\u7406\u7684\u5e93\uff0c\u5b83\u4e0d\u4ec5\u63d0\u4f9b\u8bed\u8a00\u8bc6\u522b\u529f\u80fd\uff0c\u8fd8\u652f\u6301\u60c5\u611f\u5206\u6790\u3001\u5b9e\u4f53\u8bc6\u522b\u7b49\u591a\u79cd\u81ea\u7136\u8bed\u8a00\u5904\u7406\u4efb\u52a1\u3002<\/p>\n\n\n\n

\u2022 \u5b89\u88c5<\/strong>:<\/p>\n\n\n\n

pip install polyglot<\/p>\n\n\n\n

\u2022 \u4f7f\u7528\u793a\u4f8b<\/strong>:<\/p>\n\n\n\n

from polyglot.detect import Detector<\/p>\n\n\n\n

text = \"Ceci est un exemple de texte en fran\u00e7ais\"<\/p>\n\n\n\n

detector = Detector(text)<\/p>\n\n\n\n

language = detector.language.code<\/p>\n\n\n\n

print(language)  # \u8f93\u51fa: 'fr' (\u6cd5\u8bed)<\/em><\/p>\n\n\n\n

4. TextBlob<\/strong><\/p>\n\n\n\n

\u2022 \u7b80\u4ecb<\/strong>: TextBlob \u662f\u4e00\u4e2a\u7b80\u6d01\u6613\u7528\u7684\u81ea\u7136\u8bed\u8a00\u5904\u7406\u5de5\u5177\u5305\uff0c\u867d\u7136\u5b83\u4e3b\u8981\u7528\u4e8e\u60c5\u611f\u5206\u6790\u3001\u8bcd\u6027\u6807\u6ce8\u7b49\u4efb\u52a1\uff0c\u4f46\u4e5f\u652f\u6301\u8bed\u8a00\u8bc6\u522b\u3002<\/p>\n\n\n\n

\u2022 \u5b89\u88c5<\/strong>:<\/p>\n\n\n\n

pip install textblob<\/p>\n\n\n\n

\u2022 \u4f7f\u7528\u793a\u4f8b<\/strong>:<\/p>\n\n\n\n

from textblob import TextBlob<\/p>\n\n\n\n

text = \"Hello, how are you?\"<\/p>\n\n\n\n

blob = TextBlob(text)<\/p>\n\n\n\n

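print(blob.detect_language())  # \u8f93\u51fa: 'en' (\u82f1\u8bed)\uff1b\u6ce8\u610f\uff1adetect_language() \u81ea TextBlob 0.16 \u8d77\u5df2\u5f03\u7528\uff0c0.18 \u8d77\u5df2\u79fb\u9664<\/em><\/p>\n\n\n\n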

\"\"<\/figure>\n\n\n\n

5. FastText (by Facebook)<\/strong><\/p>\n\n\n\n

\u2022 \u7b80\u4ecb<\/strong>: FastText \u662f\u4e00\u4e2a\u7531 Facebook \u63d0\u4f9b\u7684\u5f00\u6e90\u5e93\uff0c\u9664\u4e86\u9ad8\u6548\u7684\u8bcd\u5411\u91cf\u8868\u793a\u5916\uff0c\u5b83\u4e5f\u80fd\u5f88\u597d\u5730\u8fdb\u884c\u8bed\u8a00\u8bc6\u522b\u3002\u5176\u9884\u8bad\u7ec3\u6a21\u578b lid.176 \u652f\u6301 176 \u79cd\u8bed\u8a00\u3002<\/p>\n\n\n\n

\u2022 \u5b89\u88c5<\/strong>:<\/p>\n\n\n\n

pip install fasttext<\/p>\n\n\n\n

\u2022 \u4f7f\u7528\u793a\u4f8b<\/strong>:<\/p>\n\n\n\n

import fasttext<\/p>\n\n\n\n

model = fasttext.load_model('lid.176.bin')  # \u9700\u5148\u4e0b\u8f7d\u9884\u8bad\u7ec3\u6a21\u578b<\/em><\/p>\n\n\n\n

text = \"Ceci est un texte en fran\u00e7ais\"<\/p>\n\n\n\n

prediction = model.predict(text)<\/p>\n\n\n\n

print(prediction)  # \u8f93\u51fa: (('__label__fr',), array([...]))<\/em><\/p>\n\n\n\n

6. cld3 (Compact Language Detector v3)<\/strong><\/p>\n\n\n\n

\u2022 \u7b80\u4ecb<\/strong>: cld3 \u662f\u4e00\u4e2a\u9ad8\u6548\u7684\u8bed\u8a00\u68c0\u6d4b\u5e93\uff0c\u57fa\u4e8e Google \u7684 Compact Language Detector v3\u3002\u5b83\u5bf9\u77ed\u6587\u672c\u548c\u591a\u8bed\u8a00\u6587\u672c\u90fd\u6709\u4e0d\u9519\u7684\u652f\u6301\u3002<\/p>\n\n\n\n

\u2022 \u5b89\u88c5<\/strong>:<\/p>\n\n\n\n

pip install cld3<\/p>\n\n\n\n

\u2022 \u4f7f\u7528\u793a\u4f8b<\/strong>:<\/p>\n\n\n\n

import cld3<\/p>\n\n\n\n

text = \"Hola, \u00bfc\u00f3mo est\u00e1s?\"<\/p>\n\n\n\n

language = cld3.get_language(text)<\/p>\n\n\n\n

print(language)  # \u8f93\u51fa: Language: es (\u897f\u73ed\u7259\u8bed)<\/em><\/p>\n\n\n\n

\u603b\u7ed3\uff1a<\/strong><\/p>\n\n\n\n

\u2022 \u5982\u679c\u9700\u8981\u4e00\u4e2a\u7b80\u5355\u3001\u6613\u7528\u7684\u5de5\u5177\uff0clangdetect \u548c langid \u90fd\u662f\u4e0d\u9519\u7684\u9009\u62e9\u3002<\/p>\n\n\n\n

\u2022 \u5982\u679c\u5bf9\u5904\u7406\u591a\u8bed\u8a00\u7684\u6587\u672c\u548c\u9700\u8981\u5176\u4ed6 NLP \u529f\u80fd\u6709\u9700\u6c42\uff0c\u53ef\u4ee5\u8003\u8651\u4f7f\u7528 polyglot \u6216 TextBlob\u3002<\/p>\n\n\n\n

\u2022 \u5982\u679c\u9700\u8981\u66f4\u9ad8\u7cbe\u5ea6\u7684\u68c0\u6d4b\uff0c\u5c24\u5176\u662f\u5728\u77ed\u6587\u672c\u7684\u60c5\u51b5\u4e0b\uff0cFastText \u548c cld3 \u662f\u66f4\u5f3a\u5927\u7684\u9009\u62e9\u3002<\/p>\n\n\n\n

\u4f60\u53ef\u4ee5\u6839\u636e\u5177\u4f53\u9700\u6c42\u9009\u62e9\u9002\u5408\u7684\u5de5\u5177\uff01<\/p>\n","protected":false},"excerpt":{"rendered":"

Python \u4e2d\u6709\u591a\u79cd\u4f18\u79c0\u7684\u8bed\u8a00\u8bc6\u522b\u5de5\u5177\uff0c\u4ee5\u4e0b\u662f\u4e00 […]<\/p>\n","protected":false},"author":1,"featured_media":9813,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[84,86],"tags":[79,95,94],"class_list":["post-9812","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-rag","category-86","tag-rag","tag-95","tag-94"],"_links":{"self":[{"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/posts\/9812","targetHints":{"allow":["GET"]}}],"collection":[{"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/comments?post=9812"}],"version-history":[{"count":1,"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/posts\/9812\/revisions"}],"predecessor-version":[{"id":9814,"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/posts\/9812\/revisions\/9814"}],"wp:featuredmedia":[{"embeddable":true,"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/media\/9813"}],"wp:attachment":[{"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/media?parent=9812"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/categories?post=9812"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.yliyun.com\/wp-json\/wp\/v2\/tags?post=9812"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}