arranged folders

master
bootje 5 years ago
parent f8b93da0aa
commit 0be2ec1812

BIN
.DS_Store vendored

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

BIN
nltk-book/.DS_Store vendored

Binary file not shown.

@ -1 +0,0 @@
venv/

@ -1,10 +0,0 @@
import nltk
# Print every occurrence of "services" in faceapp.txt with its surrounding
# context.
# The context manager closes the file as soon as it has been read; the
# explicit encoding makes non-ASCII characters decode the same way on every
# platform instead of depending on the locale default.
with open('faceapp.txt', 'r', encoding='utf-8') as file:
    raw = file.read()

# Split the raw text into word/punctuation tokens and wrap them in an
# nltk.Text so the concordance view is available.
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)
faceapp.concordance('services')

@ -1,39 +0,0 @@
import sys
import codecs
import nltk
from nltk.corpus import stopwords
# Word-frequency report for faceapp.txt: prints a concordance for "services",
# then the ten most frequent words that are not stopwords.

# NLTK's default English stopwords
default_stopwords = set(stopwords.words('english'))

# Read extra stopwords from a file (one stopword per line, UTF-8).
# The path variable is used for the open() call so there is a single place
# to change it, and the handle is closed as soon as the set is built.
stopwords_file = './stopwords.txt'
with open(stopwords_file, 'r', encoding='utf-8') as stopwords_handle:
    custom_stopwords = set(stopwords_handle.read().splitlines())

all_stopwords = default_stopwords | custom_stopwords

# Read and tokenize the text; the file is closed once it has been read.
with open('faceapp.txt', 'r', encoding='utf-8') as file:
    raw = file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)
faceapp.concordance('services')

# Remove single-character tokens (mostly punctuation)
tokens = [word for word in tokens if len(word) > 1]

# Remove numbers
tokens = [word for word in tokens if not word.isnumeric()]

# Lowercase all words (default_stopwords are lowercase too)
tokens = [word.lower() for word in tokens]

# Remove stopwords
tokens = [word for word in tokens if word not in all_stopwords]

# Calculate frequency distribution
fdist = nltk.FreqDist(tokens)

# Output the top 10 words as "word;count" lines
for word, frequency in fdist.most_common(10):
    print(u'{};{}'.format(word, frequency))

@ -1,32 +0,0 @@
import sys
import codecs
import nltk
import json
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
# Part-of-speech tag every sentence of faceapp.txt and write the result to
# colonial-glossary.json as a list of sentences, each a list of
# [token, tag] pairs.

# Read stop words from a file (one stopword per line, UTF-8).
# NOTE(review): custom_stopwords, tokens and faceapp are built here but the
# tagging step below works from the raw text only.
stopwords_file = './stopwords.txt'
with open(stopwords_file, 'r', encoding='utf-8') as stopwords_handle:
    custom_stopwords = set(stopwords_handle.read().splitlines())

# Open the txt file, read, and tokenize; the file is closed once read.
with open('faceapp.txt', 'r', encoding='utf-8') as file:
    raw = file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)

# Remove single-character tokens (mostly punctuation)
tokens = [word for word in tokens if len(word) > 1]

# Remove numbers
tokens = [word for word in tokens if not word.isnumeric()]

# Lowercase all words
tokens = [word.lower() for word in tokens]

# Tag sentence by sentence. The result gets its own name so the imported
# pos_tag function is not overwritten by the list it produces.
tagged_sentences = [
    pos_tag(word_tokenize(sentence)) for sentence in sent_tokenize(raw)
]
print(tagged_sentences)

with open('colonial-glossary.json', 'w') as json_file:
    json.dump(tagged_sentences, json_file)

@ -1,30 +0,0 @@
import nltk
# Count how often each non-stopword occurs in faceapp.txt, then print a
# concordance for the word "a".

# Read the text; the file is closed as soon as it has been read.
with open('faceapp.txt', 'r', encoding='utf-8') as file:
    raw = file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)

# my stopwords are common words I don't want to count, like "a", "an", "the".
with open('stopwords.txt', encoding='utf-8') as stopwords_handle:
    stopwords = set(line.strip() for line in stopwords_handle)

# dictionary: word -> number of occurrences
wordcount = {}

# Splitting words from punctuation so "book" and "book!" count as the same
# word. One translation table removes all listed characters in a single
# pass; the closing curly quote is stripped together with the opening one.
strip_punctuation = str.maketrans('', '', '.,:"!“”‘*()')

for word in raw.lower().split():
    word = word.translate(strip_punctuation)
    # Skip tokens that were pure punctuation and words on the stopword list,
    # and record everything else so the dictionary actually holds the counts.
    if word and word not in stopwords:
        wordcount[word] = wordcount.get(word, 0) + 1

faceapp.concordance('a')

File diff suppressed because one or more lines are too long

@ -1,58 +0,0 @@
EPISTEMIC = "epistemic"  # Expresses degree of coloniality.

# Gradation of intensity words:
# 100.00 = absolute level of coloniality
#  90.00 = extreme level of coloniality
#  80.00 = heavy level of coloniality
#  70.00 = high level of coloniality
#  60.00 = significant level of coloniality
#  50.00 =
#  40.00 = relative level of coloniality
#  30.00 = moderate level of coloniality
#  20.00 = reasonable level of coloniality
#  10.00 = fair level of coloniality
#   0.00 = neutral level of coloniality

# Part-of-speech tags used for the tables:
# MD  = modal 'would', 'could'...
# RB  = adverb 'very', 'slightly'...
# VB  = verb
# JJ  = adjective 'big'...
# NN  = noun
# CC  = coordinating conjunction 'and', 'or'...
# PRP = personal pronoun 'I', 'he', 'she'...


def d(*args):
    """Return a lookup table that maps each given word to True.

    The tables below call this helper for every level, so it has to exist
    before they are built. Called without arguments it returns an empty
    dict, which marks a level that has no words yet.

    NOTE(review): this mirrors the helper of the same name in the Pattern
    library's modality module, whose layout these tables appear to follow --
    confirm that this is the intended definition.
    """
    return dict.fromkeys(args, True)


# Modal verbs by level.
# NOTE(review): levels 100.00 and 90.00 hold the same words, and
# "can"/"ca"/"may" appear at both 80.00 and 20.00 -- a lookup that scans the
# levels will be ambiguous for these words; confirm the intended placement.
epistemic_MD = {  # would => could => can => should => shall => will => must
    100.00: d("have", "has", "must", "need"),
    90.00: d("have", "has", "must", "need"),
    80.00: d("can", "ca", "may"),
    70.00: d(),
    60.00: d(),
    50.00: d("shall", "sha"),
    40.00: d("will", "'ll", "wo"),
    30.00: d(),
    20.00: d("can", "ca", "may"),
    10.00: d("could", "dare", "might"),
    0.00: d("would"),
}

# Verbs from the FaceApp ToS by level.
# NOTE(review): "use" appears at both 100.00 and 90.00 -- confirm which level
# is intended.
epistemic_VB = {
    100.00: d("must", "agree", "use"),
    # "acknowledge" is spelled as it occurs in the terms so it can match.
    90.00: d("use", "bound", "access", "allow", "acknowledge", "reproduce"),
    80.00: d("choose", "claim", "permit", "collect"),
    70.00: d("change"),
    60.00: d("create"),
    50.00: d(),
    40.00: d("maintain"),
    30.00: d("support"),
    20.00: d("identify"),
    10.00: d("may"),
    0.00: d(),
}

@ -1,239 +0,0 @@
Terms of Service
1. Eligibility
You must be at least 13 years of age to access or use our Services. If you are under 18 years of age (or the age of legal majority where you live), you may only access or use our Services under the supervision of a parent or legal guardian who agrees to be bound by this Agreement. If you are a parent or legal guardian of a user under the age of 18 (or the age of legal majority), you agree to be fully responsible for the acts or omissions of such user in connection with our Services. If you are accessing or using our Services on behalf of another person or entity, you represent that you are authorized to accept this Agreement on that person or entity’s behalf and that the person or entity agrees to be responsible to us if you or the other person or entity violates this Agreement.
2. User Accounts and Account Security
If you choose to login to the Services via a third-party platform or social media network, you will need to use your credentials (e.g., username and password) from a third-party online platform. You must maintain the security of your third party account and promptly notify us if you discover or suspect that someone has accessed your account without your permission. If you permit others to use your account credentials, you are responsible for the activities of such users that occur in connection with your account.
3. Privacy
Please refer to our Privacy Policy for information about how we collect, use and disclose information about you.
4. User Content
Our Services may allow you and other users to create, post, store and share content, including photos, videos, messages, text, software and other materials (collectively, “User Content”). User Content does not include user-generated filters. Subject to this Agreement and the Privacy Policy, you retain all rights in and to your User Content, as between you and FaceApp. Further, FaceApp does not claim ownership of any User Content that you post on or through the Services. You grant FaceApp a nonexclusive, royalty-free, worldwide, fully paid license to use, reproduce, modify, adapt, create derivative works from, distribute, perform and display your User Content during the term of this Agreement solely to provide you with the Services.
You acknowledge that some of the Services are supported by advertising revenue and may display advertisements and promotions, and you hereby agree that FaceApp may place such advertising and promotions on the Services or on, about, or in conjunction with your User Content. The manner, mode and extent of such advertising and promotions are subject to change without specific notice to you. You acknowledge that we may not always identify paid services, sponsored content, or commercial communications as such.
You represent and warrant that: (i) you own or otherwise have the right to use the User Content modified by you on or through the Services in accordance with the rights and licenses set forth in this Agreement; (ii) you agree to pay for all royalties, fees, and any other monies owed by reason of User Content you stylize on or through the Services; and (iii) you have the legal right and capacity to enter into this Agreement in your jurisdiction.
You may not create, post, store or share any User Content that violates this Agreement or for which you do not have all the rights necessary to grant us the license described above. Although we have no obligation to screen, edit or monitor User Content, we may delete or remove User Content at any time and for any reason.
FaceApp is not a backup service and you agree that you will not rely on the Services for the purposes of User Content backup or storage. FaceApp will not be liable to you for any modification, suspension, or discontinuation of the Services, or the loss of any User Content.
5. Prohibited Conduct and Content
You will not violate any applicable law, contract, intellectual property or other third-party right or commit a tort, and you are solely responsible for your conduct while accessing or using our Services. You will not:
Engage in any harassing, threatening, intimidating, predatory or stalking conduct;
Use or attempt to use another user’s account without authorization from that user and FaceApp;
Use our Services in any manner that could interfere with, disrupt, negatively affect or inhibit other users from fully enjoying our Services or that could damage, disable, overburden or impair the functioning of our Services in any manner;
Reverse engineer any aspect of our Services or do anything that might discover source code or bypass or circumvent measures employed to prevent or limit access to any part of our Services;
Attempt to circumvent any content-filtering techniques we employ or attempt to access any feature or area of our Services that you are not authorized to access;
Develop or use any third-party applications that interact with our Services without our prior written consent, including any scripts designed to scrape or extract data from our Services;
Use our Services for any illegal or unauthorized purpose, or engage in, encourage or promote any activity that violates this Agreement.
You may also only post or otherwise share User Content that is non-confidential and you have all necessary rights to disclose. You may not create, post, store or share any User Content that:
Is unlawful, libelous, defamatory, obscene, pornographic, indecent, lewd, suggestive, harassing, threatening, invasive of privacy or publicity rights, abusive, inflammatory or fraudulent;
Would constitute, encourage or provide instructions for a criminal offense, violate the rights of any party or otherwise create liability or violate any local, state, national or international law;
May infringe any patent, trademark, trade secret, copyright or other intellectual or proprietary right of any party;
Contains or depicts any statements, remarks or claims that do not reflect your honest views and experiences;
Impersonates, or misrepresents your affiliation with, any person or entity;
Contains any unsolicited promotions, political campaigning, advertising or solicitations;
Contains any private or personal information of a third party without such third party’s consent;
Contains any viruses, corrupted data or other harmful, disruptive or destructive files or content; or
Is, in our sole judgment, objectionable or that restricts or inhibits any other person from using or enjoying our Services, or that may expose FaceApp or others to any harm or liability of any type.
In addition, although we have no obligation to screen, edit or monitor User Content, we may delete or remove User Content at any time and for any reason.
6. Limited License; Copyright and Trademark
Our Services and the text, graphics, images, photographs, videos, illustrations, trademarks, trade names, page headers, button icons, scripts, service marks, logos, slogans, filters, user generated filters and other content contained therein (collectively, the “FaceApp Content”) are owned by or licensed to FaceApp and are protected under both United States and foreign laws. Except as explicitly stated in this Agreement, FaceApp and our licensors reserve all rights in and to our Services and the FaceApp Content. You are hereby granted a limited, nonexclusive, nontransferable, non-sublicensable, revocable license to access and use our Services and FaceApp Content for your own personal use; however, such license is subject to this Agreement and does not include any right to: (a) sell, resell or commercially use our Services or FaceApp Content; (b) copy, reproduce, distribute, publicly perform or publicly display FaceApp Content, except as expressly permitted by us or our licensors; (c) modify the FaceApp Content, remove any proprietary rights notices or markings, or otherwise make any derivative uses of our Services or FaceApp Content, except as expressly set forth in this Agreement; (d) use any data mining, robots or similar data gathering or extraction methods; or (e) use our Services or FaceApp Content other than as expressly provided in this Agreement. Any use of our Services or FaceApp Content other than as specifically authorized herein, without our prior written permission, is strictly prohibited and will terminate the license granted under this Agreement. You will not remove, alter or conceal any copyright, trademark, service mark or other proprietary rights notices incorporated in or accompanying the FaceApp Content.
7. Feedback
Any questions, comments, suggestions, ideas, original or creative materials or other information you submit about FaceApp or our products or Services (collectively, “Feedback”), is non-confidential and we have no obligations (including without limitation obligations of confidentiality) with respect to such Feedback.
You hereby grant to FaceApp a fully paid, royalty-free, perpetual, irrevocable, worldwide, non-exclusive, and fully sublicensable right and license to use, reproduce, perform, display, distribute, adapt, modify, re-format, create derivative works of, and otherwise commercially or non-commercially exploit in any manner, any and all Feedback, and to sublicense the foregoing rights, in connection with the operation and maintenance of the Services and/or FaceApp’s business.
8. Copyright Complaints
We have a policy of limiting access to our Services and terminating the accounts of users who repeatedly infringe the intellectual property copyright rights of others upon prompt notification to us by the copyright owner or the copyright owner’s legal agent. Without limiting the foregoing, if you believe that your work has been copied and posted on or through the Services in a way that constitutes copyright infringement, please provide our Copyright Agent with the following information: (a) an electronic or physical signature of the person authorized to act on behalf of the owner of the copyright interest; (b) a description of the copyrighted work that you claim has been infringed; (c) a description of the location on the Services of the material that you claim is infringing; (d) your address, telephone number and e-mail address; (e) a written statement by you that you have a good faith belief that the disputed use is not authorized by the copyright owner, its agent or the law; and (f) a statement by you, made under penalty of perjury, that the above information in your notice is accurate and that you are the copyright owner or authorized to act on the copyright owner’s behalf. Contact information for FaceApp’s Copyright Agent for notice of claims of infringement is as follows: Yaroslav Goncharov, Designated DMCA Copyright Agent, FaceApp Inc, 1000 N West Street, Suite 1200, Wilmington, Delaware, 19801.
9. Indemnification
To the fullest extent permitted by applicable law, you will indemnify, defend, and hold harmless FaceApp and each of our respective officers, directors, agents, partners and employees (individually and collectively, the “FaceApp Parties”) from and against any loss, liability, claim, demand, damages, expenses or costs (“Claims”) arising out of or related to (a) your access to or use of our Services; (b) your User Content or Feedback; (c) your violation of this Agreement; (d) your violation, misappropriation or infringement of any rights of another (including intellectual property rights or privacy rights); or (e) your conduct in connection with our Services. You agree to promptly notify FaceApp Parties of any third party Claims, cooperate with FaceApp Parties in defending such Claims and pay all fees, costs and expenses associated with defending such Claims (including, but not limited to, attorneys’ fees). You also agree that the FaceApp Parties will have control of the defense or settlement of any third party Claims. This indemnity is in addition to, and not in lieu of, any other indemnities set forth in a written agreement between you and FaceApp or the other FaceApp Parties.
10. Disclaimers
We do not control, endorse or take responsibility for any User Content or third-party content available on or linked to by our Services.
YOUR USE OF OUR SERVICES IS AT YOUR SOLE RISK. OUR SERVICES ARE PROVIDED “AS IS” AND “AS AVAILABLE” WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, AND NON-INFRINGEMENT. In addition, FaceApp does not represent or warrant that our Services are accurate, complete, reliable, current or error-free. While FaceApp attempts to make your access to and use of our Services safe, we cannot and do not represent or warrant that our Services or servers are free of viruses or other harmful components. You assume the entire risk as to the quality and performance of the Services.
11. Limitation of Liability
FACEAPP AND THE OTHER FACEAPP PARTIES WILL NOT BE LIABLE TO YOU UNDER ANY THEORY OF LIABILITY—WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE—FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF FACEAPP OR THE OTHER FACEAPP PARTIES HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
THE TOTAL LIABILITY OF FACEAPP AND THE OTHER FACEAPP PARTIES, FOR ANY CLAIM ARISING OUT OF OR RELATING TO THIS AGREEMENT OR OUR SERVICES, REGARDLESS OF THE FORM OF THE ACTION, IS LIMITED TO THE AMOUNT PAID, IF ANY, BY YOU TO ACCESS OR USE OUR SERVICES.
The limitations set forth in this section will not limit or exclude liability for the gross negligence, fraud or intentional misconduct of FaceApp or the other FaceApp Parties or for any other matters in which liability cannot be excluded or limited under applicable law. Additionally, some jurisdictions do not allow the exclusion or limitation of incidental or consequential damages, so the above limitations or exclusions may not apply to you.
12. Release
To the fullest extent permitted by applicable law, you release FaceApp and the other FaceApp Parties from responsibility, liability, claims, demands, and/or damages (actual and consequential) of every kind and nature, known and unknown (including, but not limited to, claims of negligence), arising out of or related to disputes between users and the acts or omissions of third parties. You expressly waive any rights you may have under California Civil Code § 1542 as well as any other statute or common law principles that would otherwise limit the coverage of this release to include only those claims which you may know or suspect to exist in your favor at the time of agreeing to this release.
13. Transfer and Processing Data
By accessing or using our Services, you acknowledge and, as applicable, consent to the processing, transfer and storage of information about you in and to the United States and other countries.
14. Dispute Resolution; Binding Arbitration Agreement
Please read the following section carefully because it requires users who are U.S. residents to arbitrate certain disputes and claims with FaceApp and limits the manner in which you can seek relief from us.
Applicability of Arbitration Agreement. Except for small claims disputes in which you or FaceApp seek to bring an individual action in small claims court located in the county of your billing address or disputes in which you or FaceApp seeks injunctive or other equitable relief for the alleged unlawful use of intellectual property, you and FaceApp waive your rights to a jury trial and to have any dispute arising out of or related to this Agreement or our Services resolved in court. This Arbitration Agreement shall apply, without limitation, to all disputes or claims and requests for relief that arose or were asserted before the effective date of this Agreement or any prior version of this Agreement.
Arbitration Rules and Forum. The Federal Arbitration Act governs the interpretation and enforcement of this Arbitration Agreement. To begin an arbitration proceeding, you must send a letter requesting arbitration and describing your dispute or claim or request for relief to our registered agent [include name and address of registered agent here]. The arbitration will be resolved through confidential binding arbitration by the Judicial Arbitration and Mediation Services (“JAMS”), an established alternative dispute resolution provider. Disputes involving claims, counterclaims, or requests for relief under $250,000, not inclusive of attorneys’ fees and interest, shall be subject to JAMS’s most current version of the Streamlined Arbitration Rules and procedures available; all other disputes shall be subject to JAMS’s most current version of the Comprehensive Arbitration Rules and Procedures, available at http://www.jamsadr.com/rules-comprehensive-arbitration/. JAMS’s rules are also available at www.jamsadr.com or by calling JAMS at 800-352-5267. If JAMS is not available to arbitrate, the parties will select an alternative arbitral forum. If the arbitrator finds that you cannot afford to pay JAMS’s filing, administrative, hearing and/or other fees and cannot obtain a waiver from JAMS, FaceApp will pay them for you. In addition, we will reimburse all such JAMS’s filing, administrative, hearing and/or other fees for disputes, claims, or requests for relief totaling less than $10,000 unless the arbitrator determines the claims are frivolous. You may choose to have the arbitration conducted by telephone, based on written submissions, or in person in the country where you live or at another mutually agreed location. Any judgment on the award rendered by the arbitrator may be entered in any court of competent jurisdiction.
You may choose to have the arbitration conducted by telephone, based on written submissions or at another mutually agreed location. Any judgment on the award rendered by the arbitrator may be entered in any court of competent jurisdiction.
Authority of Arbitrator. The arbitrator shall have exclusive authority to (a) determine the scope and enforceability of this Arbitration Agreement and (b) resolve any dispute related to the interpretation, applicability, enforceability or formation of this Arbitration Agreement, including, but not limited to, any assertion that all or any part of this Arbitration Agreement is void or voidable. The arbitration will decide the rights and liabilities, if any, of you and FaceApp. The arbitration proceeding will not be consolidated with any other matters or joined with any other cases or parties. The arbitrator shall have the authority to grant motions dispositive of all or part of any claim. The arbitrator shall have the authority to award monetary damages and to grant any non-monetary remedy or relief available to an individual under applicable law, the arbitral forum’s rules, and the Agreement (including the Arbitration Agreement). The arbitrator shall issue a written award and statement of decision describing the essential findings and conclusions on which the award is based, including the calculation of any damages awarded. The arbitrator has the same authority to award relief on an individual basis that a judge in a court of law would have. The award of the arbitrator is final and binding upon you and us.
Waiver of Jury Trial. YOU AND FACEAPP HEREBY WAIVE ANY CONSTITUTIONAL AND STATUTORY RIGHTS TO SUE IN COURT AND HAVE A TRIAL IN FRONT OF A JUDGE OR A JURY. You and FaceApp are instead electing that all disputes, claims or requests for relief shall be resolved by arbitration under this Arbitration Agreement, except as specified above. An arbitrator can award on an individual basis the same damages and relief as a court and must follow this Agreement as a court would. However, there is no judge or jury in arbitration, and court review of an arbitration award is subject to very limited review.
Waiver of Class or Other Non-Individualized Relief. ALL DISPUTES, CLAIMS AND REQUESTS FOR RELIEF WITHIN THE SCOPE OF THIS ARBITRATION AGREEMENT MUST BE ARBITRATED ON AN INDIVIDUAL BASIS AND NOT ON A CLASS OR COLLECTIVE BASIS. ONLY INDIVIDUAL RELIEF IS AVAILABLE, AND CLAIMS OF MORE THAN ONE USER CANNOT BE ARBITRATED OR CONSOLIDATED WITH THOSE OF ANY OTHER USER. If a decision is issued stating that applicable law precludes enforcement of any of this subsection’s limitations as to a given dispute, claim or request for relief, then such aspect must be severed from the arbitration and brought into the State or Federal Courts located in the State of California. All other disputes, claims, or requests for relief shall be arbitrated.
30-Day Right to Opt-Out. You have the right to opt out of the provisions of this Arbitration Agreement by sending written notice of your decision to opt-out to: arbitration@faceapp.com, within 30 days after first becoming subject to this Arbitration Agreement. Your notice must include your name and address, your username (if any), the e-mail address you used to set up your account (if you have one), and an unequivocal statement that you want to opt out of this Arbitration Agreement. If you opt out of this Arbitration Agreement, all other parts of this Agreement will continue to apply to you. Opting out of this Arbitration Agreement has no effect on any other arbitration agreements that you may currently have, or may enter in the future, with us.
You and FaceApp agree that the state or federal courts of the State of California and the United States sitting in Santa Clara County, California have exclusive jurisdiction over any appeals and the enforcement of an arbitration award.
Severability. Except as provided in this Section 14 above, if any part or parts of this Arbitration Agreement are found under the law to be invalid or unenforceable, then such specific part or parts shall be of no force and effect and shall be severed, and the remainder of the Arbitration Agreement shall continue in full force and effect.
Survival of Agreement. This Arbitration Agreement will survive the termination of your relationship with FaceApp.
Modification. Notwithstanding any provision in this Agreement to the contrary, we agree that if FaceApp makes any future material change to this Arbitration Agreement you may reject that change within thirty (30) days of such change becoming effective by writing Company at the following address: arbitration@faceapp.com.
15. Governing Law and Venue
This Agreement and your access to and use of our Services will be governed by and construed and enforced in accordance with the laws of California, consistent with the Federal Arbitration Act, without regard to conflict of law rules or principles (whether of California or any other jurisdiction) that would cause the application of the laws of any other jurisdiction. The United Nations Convention for the International Sale of Goods does not apply to the Agreement. Any dispute between the parties that is not subject to arbitration or cannot be heard in small claims court will be resolved in the state or federal courts of California and the United States, respectively, sitting in Santa Clara County, California.
16. Electronic Communications
By accessing or using the Services, you also consent to receive electronic communications from FaceApp (e.g., responses to your requests, questions and feedback, announcements, updates, and security alerts through a push notification or by posting notices on our Services). You agree that any notices, agreements, disclosures or other communications that we send to you electronically will satisfy any legal communication requirements, including, but not limited to, that such communications be in writing.
17. Termination
We reserve the right, without notice and in our sole discretion, to terminate your right to access or use our Services. We are not responsible for any loss or harm related to your inability to access or use our Services.
18. Severability
If any provision or part of a provision of this Agreement is unlawful, void or unenforceable, that provision or part of the provision is deemed severable from this Agreement and does not affect the validity and enforceability of any remaining provisions.
19. Additional Terms Applicable to iOS Devices
The following terms apply if you install, access or use the Services on any device that contains the iOS mobile operating system (the “App”) developed by Apple Inc. (“Apple”).
Acknowledgement. You acknowledge that this Agreement is concluded solely between us, and not with Apple, and FaceApp, not Apple, is solely responsible for the App and the content thereof. You further acknowledge that the usage rules for the App are subject to any additional restrictions set forth in the Usage Rules for the Apple App Store Terms of Service as of the date you download the App, and in the event of any conflict, the Usage Rules in the App Store shall govern if they are more restrictive. You acknowledge and agree that you have had the opportunity to review the Usage Rules.
Scope of License. The license granted to you is limited to a non-transferable license to use the App on any iPhone, iPod touch or iPad that you own or control as permitted by the Usage Rules set forth in the Apple App Store Terms of Service.
Maintenance and Support. You and FaceApp acknowledge that Apple has no obligation whatsoever to furnish any maintenance and support services with respect to the App.
Warranty. You acknowledge that Apple is not responsible for any product warranties, whether express or implied by law, with respect to the App. In the event of any failure of the App to conform to any applicable warranty, you may notify Apple, and Apple will refund the purchase price, if any, paid to Apple for the App by you; and to the maximum extent permitted by applicable law, Apple will have no other warranty obligation whatsoever with respect to the App. The parties acknowledge that to the extent that there are any applicable warranties, any other claims, losses, liabilities, damages, costs or expenses attributable to any failure to conform to any such applicable warranty would be the sole responsibility of FaceApp. However, you understand and agree that in accordance with this Agreement, FaceApp has disclaimed all warranties of any kind with respect to the App, and therefore, there are no warranties applicable to the App.
Product Claims. You and FaceApp acknowledge that as between Apple and FaceApp, FaceApp, not Apple, is responsible for addressing any claims relating to the App or your possession and/or use of the App, including, but not limited to (a) product liability claims, (b) any claim that the App fails to conform to any applicable legal or regulatory requirement, and (c) claims arising under consumer protection or similar legislation.
Intellectual Property Rights. The parties acknowledge that, in the event of any third party claim that the App or your possession and use of the App infringe that third party’s intellectual property rights, FaceApp, and not Apple, will be solely responsible for the investigation, defense, settlement and discharge of any such intellectual property infringement claim to the extent required under this Agreement.
Legal Compliance. You represent and warrant that (a) you are not located in a country that is subject to a U.S. Government embargo, or that has been designated by the U.S. Government as a “terrorist supporting” country, and (b) you are not listed on any U.S. Government list of prohibited or restricted parties.
Developer Name and Address. Any questions, complaints or claims with respect to the App should be directed to:
FaceApp Inc
1000 N West Street, Suite 1200,
Wilmington, Delaware, 19801
USA
contact@faceapp.com
Third-Party Terms of Agreement. You agree to comply with any applicable third-party terms when using the Services.
Third-Party Beneficiary. The parties acknowledge and agree that Apple, and Apple's subsidiaries, are third-party beneficiaries of this Agreement, and that, upon your acceptance of this Agreement, Apple will have the right (and will be deemed to have accepted the right) to enforce this Agreement against you as a third-party beneficiary thereof.
20. Export
You may not use, export, import, or transfer all or any portion of the Services except as authorized by U.S. law, the laws of the jurisdiction in which you obtained the Services, and any other applicable laws. In particular, but without limitation, the Services may not be exported or re-exported (a) into any United States embargoed countries, or (b) to anyone on the U.S. Treasury Department's list of Specially Designated Nationals or the U.S. Department of Commerce's Denied Persons List or Entity List. By using the Services, you represent and warrant that (y) you are not located in a country that is subject to a U.S. Government embargo, or that has been designated by the U.S. Government as a “terrorist supporting” country and (z) you are not listed on any U.S. Government list of prohibited or restricted parties. You also will not use the Services for any purpose prohibited by U.S. law, including the development, design, manufacture or production of missiles, nuclear, chemical or biological weapons. You acknowledge and agree that products, services or technology provided by FaceApp are subject to the export control laws and regulations of the United States. You shall comply with these laws and regulations and shall not, without prior U.S. government authorization, export, re-export, or transfer FaceApp products, services or technology, either directly or indirectly, to any country in violation of such laws and regulations.
21. Miscellaneous
In accordance with California Civil Code section 1789.3, you may report complaints to the Complaint Assistance Unit of the Division of Consumer Services of the California Department of Consumer Affairs by contacting them in writing at 400 R Street, Sacramento, CA 95814, or by telephone at (800) 952-5210. This Agreement constitutes the entire agreement between you and FaceApp relating to your access to and use of our Services. The failure of FaceApp to exercise or enforce any right or provision of this Agreement will not operate as a waiver of such right or provision. The section titles in this Agreement are for convenience only and have no legal or contractual effect. Except as otherwise provided herein, this Agreement is intended solely for the benefit of the parties and is not intended to confer third party beneficiary rights upon any other person or entity.
Privacy Policy
Personal Information We Collect
When you use the App, we may collect information about you, including:
Photographs you provide when you use the App, via your camera or camera roll (if you have granted us permission to access your camera or camera roll), the in-App internet search functionality, or your social media account (if you choose to connect your social media account). We obtain only the specific images you chose to modify using the App; we do not collect your photo albums even if you grant us your access to them. We encrypt each photograph that you upload using the App. The encryption key is stored locally on your device. This means that the only device that can view the photo is the device from which the photograph was uploaded using the App — the user's device. Please note that while we do not require or request any metadata attached to the photographs you upload, metadata (including, for example, geotags) may be associated with your photographs by default. We take steps to delete any metadata that may be associated with a photograph you provide when you use the App.
App usage information, such as information about how you use the App and interact with us, including your preferred language, the date and time when you first installed the App and the date and time you last used the App.
Purchase history, if you choose to purchase an App subscription, such as confirmation that you are a paid subscriber to the App.
Social media information, if you choose to login to the App via a third-party platform or social media network (for example, Facebook), or otherwise connect your account on the third-party platform or network to the App. We may collect information from that platform or network, such as your social media alias, first and last name, number of “friends” on the social media platform and, if depending on your Facebook or other network settings, a list of your friends or connections (though we do not use or store this information). Our collection and processing of the information we obtain from social media platforms is governed by the requirements these social media platforms impose on us in their relevant terms and conditions.
Device data, such as your computer and mobile device operating system type and version number, manufacturer and model, device ID, push tokens, Google Advertising ID, Apple ID for Advertising, browser type, screen resolution, IP address (and the associated country in which you are located), the website you visited before visiting our Site; and other information about the device you are using to visit the App.
Online activity data, such as information about your use of and actions on the App and the Sites, including pages or screens you viewed, how long you spent on a page or screen, navigation paths between pages or screens, information about your activity on a page or screen, access times, and length of access. Our service providers and certain third parties (e.g., online advertising networks and their clients) also may collect this type of information over time and across third-party websites and mobile applications. This information may be collected on our Site using cookies, browser web storage (also known as locally stored objects, or “LSOs”), web beacons, and similar technologies. We may collect this information directly or through our use of third-party software development kits (“SDKs”). SDKs may enable third parties to collect information directly from our App.
How We Use Your Personal Information
We do not use the photographs you provide when you use the App for any reason other than to provide you with the portrait editing functionality of the App. We may use information other than photographs for the following purposes:
To operate and improve the App:
Enable you to use the App's features;
Establish and maintain your account, if you choose to login to the App using your social media account;
Communicate with you about the App, including by sending you announcements, updates, and security alerts, which we may send through a push notification, and responding to your requests, questions and feedback;
Provide technical support and maintenance for the App; and
Perform statistical analysis about use of the App (including through the use of Google Analytics).
To send you marketing and promotional communications. We may send you marketing communications as permitted by law. You will have the ability to opt-out of our marketing and promotional communications as described in the Opt out of marketing section below.
To display advertisements to you. If you use the free version of the App, we work with advertising partners to display advertisements within the App. These advertisements are delivered by our advertising partners and may be targeted based on your use of the App or your activity elsewhere online. To learn more about your choices in connection with advertisements, please see the section below titled “Targeted online advertising.”
For compliance, fraud prevention, and safety. We may use your personal information and disclose it to law enforcement, government authorities, and private parties as we believe necessary or appropriate to: (a) protect our, your or others' rights, privacy, safety or property (including by making and defending legal claims); (b) enforce the terms and conditions that govern the Service; and (c) protect, investigate and deter against fraudulent, harmful, unauthorized, unethical or illegal activity.
With your consent. In some cases, we may specifically ask for your consent to collect, use or share your personal information, such as when required by law.
To create anonymous, aggregated or de-identified data. We may create anonymous, aggregated or de-identified data from your personal information and other individuals whose personal information we collect. We make personal information into anonymous, aggregated or de-identified data by removing information that makes the data personally identifiable to you. We may use this anonymous, aggregated or de-identified data and share it with third parties for our lawful business purposes.
How We Share Your Personal Information
We do not disclose user photographs to third parties (with the exception of uploading an encrypted image to our cloud providers Google Cloud Platform and Amazon Web Services to provide the photo editing features of the App). We may share your non-photograph information in the following circumstances:
Affiliates. We may share App usage information with our subsidiaries and affiliates, for purposes consistent with this Privacy Policy.
Service providers. We may share your personal information with services providers that perform services on our behalf or help us operate the App (such as customer support, hosting, analytics, email delivery, marketing, and database management services). These third parties may use your personal information only as directed or authorized by us and in a manner consistent with this Privacy Policy, and are prohibited from using or disclosing your information for any other purpose.
Advertising partners. When we use third-party cookies and other tracking tools, our advertising partners may collect information from your device to help us analyze use of the Site and the App, display advertisements on the App and advertise the Site and App (and related content) elsewhere online.
Third-party platforms and social media networks. If you have enabled features or functionality that connect the App to a third-party platform or social media network (such as by logging into FaceApp using your account with the third-party, providing your API key or similar access token for the App to a third-party, or otherwise linking your account with the App to a third-party's services), we may disclose the personal information that you authorized us to share (such as when you elect to upload a photograph to your social media account). We do not control the third-party platform's use of your personal information, which is governed by that third party's privacy policy and terms and conditions.
Professional advisors. We may disclose your personal information to professional advisors, such as lawyers, bankers, auditors and insurers, where necessary in the course of the professional services that they render to us.
For compliance, fraud prevention and safety. We may share your personal information for the compliance, fraud prevention and safety purposes described above.
Business transfers. We may sell, transfer or otherwise share some or all of our business or assets, including your personal information, in connection with a business transaction (or potential business transaction) such as a corporate divestiture, merger, consolidation, acquisition, reorganization or sale of assets, or in the event of bankruptcy or dissolution.
Compliance with Law
We may be required to use and share your personal information to comply with applicable laws, lawful requests, and legal process, such as to respond to subpoenas or requests from government authorities.
Your Choices
In this section, we describe the rights and choices available to all users. Users who are located within Europe can find additional information about their rights below.
Opt out of marketing communications and other push notifications. You may opt out of marketing-related communications and other notifications we may send you via push notification by changing the settings on your mobile device.
Device permissions. You may revoke any permissions you previously granted to us, such as permission to access your camera or camera roll, through the settings on your mobile device.
Cloud processing. You may request that we remove your information, including photographs, from the cloud before the 24-48 hour period after which Google Cloud Platform or Amazon Web Services automatically deletes the information by clicking the “Request cloud data removal” button in the “Support” section of the App Settings on your mobile device.
Cookies & Browser Web Storage. Most browsers let you remove or reject cookies. To do this, follow the instructions in your browser settings. Many browsers accept cookies by default until you change your settings. Please note that if you set your browser to disable cookies, the Site may not work properly. Similarly, your browser settings may allow you to clear your browser web storage.
Targeted online advertising. Some of the business partners that collect information about users' activities on or through the Site or App may be members of organizations or programs that provide choices to individuals regarding the use of their browsing behavior or mobile application usage for purposes of targeted advertising.
Site users may opt out of receiving targeted advertising on websites through members of the Network Advertising Initiative by clicking here or the Digital Advertising Alliance by clicking here. App users may opt out of receiving targeted advertising in mobile apps through participating members of the Digital Advertising Alliance by installing the AppChoices mobile app, available here, and selecting the user's choices. Please note that we also may work with companies that offer their own opt-out mechanisms and may not participate in the opt-out mechanisms that we linked above.
In addition, your mobile device settings may provide functionality to limit our, or our partners', ability to engage in ad tracking or targeted advertising using the Google Advertising ID or Apple ID for Advertising associated with your mobile device.
If you choose to opt-out of targeted advertisements, you will still see advertisements online but they may not be relevant to you. Even if you do choose to opt out, not all companies that serve online behavioral advertising are included in this list, so you may still receive some cookies and tailored advertisements from companies that are not listed.
Choosing not to share your personal information. Where we are required by law to collect your personal information, or where we need your personal information in order to provide the App to you, if you do not provide this information when requested (or you later ask to delete it), we may not be able to provide you with our services. We will tell you what information you must provide to use the App by designating it as required at the time of collection or through other appropriate means.
Third-party platforms or social media networks. If you choose to connect to the App via a third-party platform or social media network, such as by using Facebook login, you may have the ability to limit the information that we may obtain from the third-party at the time you login to the App using the third-party's authentication service or otherwise connect your account. Subsequently, you may be able to control your settings through the third-party's platform or service. For example, you may access and change your settings through the Facebook settings page for Apps and Websites. If you withdraw our ability to access certain information from a third-party platform or social media network, that choice will not apply to information that we have already received from that third party.
Other Sites, Mobile Applications and Services
The App may contain links to other websites, mobile applications, and other online services operated by third parties. These links are not an endorsement of, or representation that we are affiliated with, any third party. In addition, our content may be included on web pages or in mobile applications or online services that are not associated with us. We do not control third party websites, mobile applications or online services, and we are not responsible for their actions. Other websites, mobile applications and online services follow different rules regarding the collection, use and sharing of your personal information. We encourage you to read the privacy policies of the other websites, mobile applications and online services you use.
Security Practices
We use commercially reasonable security practices to help keep the information collected through the App secure and take reasonable steps to verify your identity before granting you access to your account (if you have an account with us). However, FaceApp cannot ensure the security of any information you transmit to FaceApp or guarantee that information on the App may not be accessed, disclosed, altered, or destroyed.
Please do your part to help us. You are responsible for maintaining the confidentiality of your login information and device identifiers, and for controlling access to communications between you and FaceApp, at all times. Your privacy settings may also be affected by changes the social media services you connect to FaceApp make to their services. We are not responsible for the functionality, privacy, or security measures of any other organization.
Retention
We configure Google Cloud Platform and Amazon Web Services to delete photographs and photograph-related information within 24-48 hours after the photograph was last edited using the App. This allows you to revisit the image for additional modifications during that time.
With respect to non-photograph information that we may collect, we will retain such information in a personally identifiable format only for as long as necessary to fulfill the purposes we have set out in this Privacy Policy. You may also ask that we delete your information using the “Request cloud data removal” button as described above or by contacting us.
Cross-Border Data Transfers
We store the information we collect in connection with the App on Amazon Web Services and Google Cloud Platform. For Amazon Web Services, we specify the US as the data storage location, for Google Cloud Platform, we specify data storage at an available location closest to you when you use the App. Your personal information may be accessed by our service providers in other locations outside of your state, province, or country. Your device ID (and general App usage information) may also be accessed by the Company's technical support team in other locations outside of your state, province, or country. We rely on the Privacy Shield, as described below, for transfers of data from the EU and Switzerland to FaceApp in the United States.
EU-U.S. Privacy Shield and Swiss-U.S. Privacy Shield
FaceApp Inc is the US entity that publishes and hosts the App. FaceApp Inc complies with the EU-U.S. and the Swiss-U.S. Privacy Shield Frameworks as set forth by the U.S. Department of Commerce regarding the collection, use, and retention of personal information transferred from the European Union and Switzerland to the United States. FaceApp Inc has submitted its certification to the Department of Commerce that it adheres to the Privacy Shield Principles. If there is any conflict between the terms in this Privacy Policy and the Privacy Shield Principles, the Privacy Shield Principles shall govern. To learn more about the Privacy Shield program, and to view our certification, please visit www.privacyshield.gov.
FaceApp Inc may transfer your personal information to third parties as described in this Privacy Policy. FaceApp Inc maintains contracts with its third-party service providers restricting their access, use and disclosure of personal information in compliance with our Privacy Shield obligations. FaceApp Inc may be liable if these third parties fail to meet those obligations and we are responsible for the event giving rise to the damage.
In compliance with the Privacy Shield Principles, FaceApp Inc commits to resolve complaints about our collection or use of your personal information. European individuals with inquiries or complaints regarding our Privacy Policy should first contact FaceApp Inc at privacy@faceapp.com. FaceApp Inc has further committed to refer unresolved Privacy Shield complaints to JAMS, an alternative dispute resolution provider located in the United States. If you do not receive timely acknowledgment of your complaint from us, or if we have not resolved your complaint, please visit www.jamsadr.com/eu-us-privacy-shield for more information or to file a complaint. The services of JAMS are provided at no cost to you. If neither FaceApp Inc nor JAMS resolves your complaint, you may have the ability to engage in binding arbitration through the Privacy Shield Panel. Additional information on the arbitration process is available on the Privacy Shield website at www.privacyshield.gov.
FaceApp Inc may be required to disclose personal data in response to lawful requests by public authorities, including to meet national security or law enforcement requirements. The Federal Trade Commission has jurisdiction over FaceApp Inc's compliance with the Privacy Shield. FaceApp Inc's commitments under the Privacy Principles are subject to the investigatory and enforcement powers of the Federal Trade Commission.

@ -1,75 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*.pyc
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
.coveralls.yml
*.cover
.hypothesis/
# Sphinx documentation
docs/_build/
*.dev*
*.nja
build
dist
# Environments
.env
.venv
env/
venv/
ENV/
# Flymake
*_flymake.py
# Pattern specific ignore pattern
pattern/web/cache/tmp/
web/cache/tmp/
pattern_unittest_db
test/pattern_unittest_db
.DS_Store

@ -1,249 +0,0 @@
[MASTER]
# Specify a configuration file.
#rcfile=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Profiled execution.
profile=no
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS, feed, json, pdf, soup, pywordnet, svm
# Pickle collected data for later comparisons.
persistent=yes
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
[MESSAGES CONTROL]
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once).
disable=C0103,W0142,E1103
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html
output-format=text
# Include message's id in output
include-ids=yes
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]".
files-output=no
# Tells whether to display a full report or only the messages
reports=yes
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables errors warning, statement which
# respectively contain the number of errors / warnings messages and the total
# number of statements analyzed. This is used by the global evaluation report
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Add a comment according to your evaluation note. This is used by the global
# evaluation report (RP0004).
comment=no
[BASIC]
# Required attributes for module, separated by a comma
required-attributes=
# List of builtins function names that should not be used, separated by a comma
bad-functions=map,filter,apply,input
# Regular expression which should only match correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression which should only match correct module level names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Regular expression which should only match correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Regular expression which should only match correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct method names
method-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct instance attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct list comprehension /
# generator expression variable names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Regular expression which should only match functions or classes name which do
# not require a docstring
no-docstring-rgx=__.*__
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=100
# Maximum number of lines in a module
max-module-lines=1000
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
[TYPECHECK]
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of classes names for which member attributes should not be checked
# (useful for classes with attributes dynamically set).
ignored-classes=SQLObject
# When zope mode is activated, add a predefined set of Zope acquired attributes
# to generated-members.
zope=no
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E0201 when accessed. Python regular
# expressions are accepted.
generated-members=REQUEST,acl_users,aq_parent
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the beginning of the name of dummy variables
# (i.e. not used).
dummy-variables-rgx=_|dummy
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
[CLASSES]
# List of interface methods to ignore, separated by a comma. This is used for
# instance to not check methods defines in Zope's Interface base class.
ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
[DESIGN]
# Maximum number of arguments for function / method
max-args=5
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of branch for function / method body
max-branchs=12
# Maximum number of statements in function / method body
max-statements=50
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,string,TERMIOS,Bastion,rexec
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception

@ -1,44 +0,0 @@
language: python
dist: precise
python:
- "3.6"
before_install:
- export TZ=Europe/Brussels
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; else wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; fi
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- conda update --yes conda
- conda install --yes numpy scipy
- pip install --quiet pytest pytest-cov pytest-xdist chardet
install:
- python setup.py install --quiet
- pip freeze
# Install and compile libsvm and liblinear
- sudo apt-get install -y build-essential
- git clone https://github.com/cjlin1/libsvm
- cd libsvm; make lib; sudo cp libsvm.so.2 /lib; sudo ln -s /lib/libsvm.so.2 /lib/libsvm.so; cd ..
- git clone https://github.com/cjlin1/liblinear
- cd liblinear; make lib; sudo cp liblinear.so.3 /lib; sudo ln -s /lib/liblinear.so.3 /lib/liblinear.so; cd ..
script:
- pytest --cov=pattern
after_script:
- pip install --quiet coveralls
- coveralls
branches:
only:
- development
notifications:
email: false
# You can connect to MySQL/MariaDB using the username "travis" or "root" and a blank password.
services:
- mysql

@ -1,29 +0,0 @@
Copyright (c) 2011-2013 University of Antwerp, Belgium
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Pattern nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

@ -1,160 +0,0 @@
Pattern
=======
[![Build Status](http://img.shields.io/travis/clips/pattern/master.svg?style=flat)](https://travis-ci.org/clips/pattern/branches)
[![Coverage](https://img.shields.io/coveralls/clips/pattern/master.svg?style=flat)](https://coveralls.io/github/clips/pattern?branch=master)
[![PyPi version](http://img.shields.io/pypi/v/pattern.svg?style=flat)](https://pypi.python.org/pypi/pattern)
[![License](https://img.shields.io/badge/License-BSD%203--Clause-green.svg?style=flat)](https://github.com/clips/pattern/blob/master/LICENSE.txt)
Pattern is a web mining module for Python. It has tools for:
* Data Mining: web services (Google, Twitter, Wikipedia), web crawler, HTML DOM parser
* Natural Language Processing: part-of-speech taggers, n-gram search, sentiment analysis, WordNet
* Machine Learning: vector space model, clustering, classification (KNN, SVM, Perceptron)
* Network Analysis: graph centrality and visualization.
It is well documented, thoroughly tested with 350+ unit tests and comes bundled with 50+ examples. The source code is licensed under BSD and available from <http://www.clips.ua.ac.be/pages/pattern>.
![Example workflow](https://raw.githubusercontent.com/clips/pattern/master/docs/g/pattern_schema.gif)
Example
-------
This example trains a classifier on adjectives mined from Twitter using Python 3. First, tweets that contain hashtag #win or #fail are collected. For example: *"$20 tip off a sweet little old lady today #win"*. The word part-of-speech tags are then parsed, keeping only adjectives. Each tweet is transformed to a vector, a dictionary of adjective → count items, labeled `WIN` or `FAIL`. The classifier uses the vectors to learn which other tweets look more like `WIN` or more like `FAIL`.
```python
from pattern.web import Twitter
from pattern.en import tag
from pattern.vector import KNN, count
twitter, knn = Twitter(), KNN()
for i in range(1, 3):
    for tweet in twitter.search('#win OR #fail', start=i, count=100):
        s = tweet.text.lower()
        p = '#win' in s and 'WIN' or 'FAIL'
        v = tag(s)
        v = [word for word, pos in v if pos == 'JJ'] # JJ = adjective
        v = count(v) # {'sweet': 1}
        if v:
            knn.train(v, type=p)
print(knn.classify('sweet potato burger'))
print(knn.classify('stupid autocorrect'))
```
Installation
------------
Pattern supports Python 2.7 and Python 3.6. To install Pattern so that it is available in all your scripts, unzip the download and from the command line do:
```bash
cd pattern-3.6
python setup.py install
```
If you have pip, you can automatically download and install from the [PyPI repository](https://pypi.python.org/pypi/Pattern):
```bash
pip install pattern
```
If none of the above works, you can make Python aware of the module in three ways:
- Put the pattern folder in the same folder as your script.
- Put the pattern folder in the standard location for modules so it is available to all scripts:
* `c:\python36\Lib\site-packages\` (Windows),
* `/Library/Python/3.6/site-packages/` (Mac OS X),
* `/usr/lib/python3.6/site-packages/` (Unix).
- Add the location of the module to `sys.path` in your script, before importing it:
```python
MODULE = '/users/tom/desktop/pattern'
import sys
if MODULE not in sys.path: sys.path.append(MODULE)
from pattern.en import parsetree
```
Documentation
-------------
For documentation and examples see the [user documentation](http://www.clips.ua.ac.be/pages/pattern). If you are a developer, go check out the [developer documentation](http://www.clips.ua.ac.be/pages/pattern-dev).
Version
-------
3.6
License
-------
**BSD**, see `LICENSE.txt` for further details.
Reference
---------
De Smedt, T., Daelemans, W. (2012). Pattern for Python. *Journal of Machine Learning Research, 13*, 2031–2035.
Contribute
----------
The source code is hosted on GitHub and contributions or donations are welcomed. Please have a look at the [developer documentation](http://www.clips.ua.ac.be/pages/pattern-dev). If you use Pattern in your work, please cite our reference paper.
Bundled dependencies
--------------------
Pattern is bundled with the following data sets, algorithms and Python packages:
- **Brill tagger**, Eric Brill
- **Brill tagger for Dutch**, Jeroen Geertzen
- **Brill tagger for German**, Gerold Schneider & Martin Volk
- **Brill tagger for Spanish**, trained on Wikicorpus (Samuel Reese & Gemma Boleda et al.)
- **Brill tagger for French**, trained on Lefff (Benoît Sagot & Lionel Clément et al.)
- **Brill tagger for Italian**, mined from Wiktionary
- **English pluralization**, Damian Conway
- **Spanish verb inflection**, Fred Jehle
- **French verb inflection**, Bob Salita
- **Graph JavaScript framework**, Aslak Hellesoy & Dave Hoover
- **LIBSVM**, Chih-Chung Chang & Chih-Jen Lin
- **LIBLINEAR**, Rong-En Fan et al.
- **NetworkX centrality**, Aric Hagberg, Dan Schult & Pieter Swart
- **spelling corrector**, Peter Norvig
Acknowledgements
----------------
**Authors:**
- Tom De Smedt (tom@organisms.be)
- Walter Daelemans (walter.daelemans@ua.ac.be)
**Contributors (chronological):**
- Frederik De Bleser
- Jason Wiener
- Daniel Friesen
- Jeroen Geertzen
- Thomas Crombez
- Ken Williams
- Peteris Erins
- Rajesh Nair
- F. De Smedt
- Radim Řehůřek
- Tom Loredo
- John DeBovis
- Thomas Sileo
- Gerold Schneider
- Martin Volk
- Samuel Joseph
- Shubhanshu Mishra
- Robert Elwell
- Fred Jehle
- Antoine Mazières + fabelier.org
- Rémi de Zoeten + closealert.nl
- Kenneth Koch
- Jens Grivolla
- Fabio Marfia
- Steven Loria
- Colin Molter + tevizz.com
- Peter Bull
- Maurizio Sambati
- Dan Fu
- Salvatore Di Dio
- Vincent Van Asch
- Frederik Elwert

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 280 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 187 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 88 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 108 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 42 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 68 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 429 B

@ -1,474 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>mbsp-tags</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/mbsp-tags" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/mbsp-tags</a></div>
<h1>Penn Treebank II tag set</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1274" class="node node-type-page"><div class="node-inner">
<div class="content">
<p class="big"><a href="pattern.html">Pattern</a> and&nbsp;<a href="http://www.clips.ua.ac.be/pages/MBSP" target="_self">MBSP</a> assign meaningful tags to words and groups of words in a sentence. Each tag is a short code (such as "<span class="postag">DT</span>" for "determiner").</p>
<p>The tag set is based on the Penn Treebank Tagging Guidelines [<a href="ftp://ftp.cis.upenn.edu/pub/treebank/doc/tagguide.ps.gz" target="_self">pdf</a>].</p>
<h3>Part-of-speech tags</h3>
<p>Part-of-speech tags are assigned to a single word according to its role in the sentence. Traditional grammar classifies words based on eight parts of speech: the verb (<span class="postag">VB</span>), the noun (<span class="postag">NN</span>), the pronoun (<span class="postag">PR</span>+<span class="postag">DT</span>), the adjective (<span class="postag">JJ</span>), the adverb (<span class="postag">RB</span>), the preposition (<span class="postag">IN</span>), the conjunction (<span class="postag">CC</span>), and the interjection (<span class="postag">UH</span>).</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Tag </span></td>
<td><span class="smallcaps">Description </span></td>
<td class="smallcaps">Example</td>
</tr>
<tr>
<td><span class="postag">CC </span></td>
<td>conjunction, coordinating</td>
<td><em>and, or, but</em></td>
</tr>
<tr>
<td><span class="postag">CD </span></td>
<td>cardinal number</td>
<td><em>five, three, 13%</em></td>
</tr>
<tr>
<td><span class="postag">DT </span></td>
<td>determiner</td>
<td><em>the, a, these <br /></em></td>
</tr>
<tr>
<td><span class="postag">EX </span></td>
<td>existential there</td>
<td><em><span style="text-decoration: underline;">there</span> were six boys <br /></em></td>
</tr>
<tr>
<td><span class="postag">FW </span></td>
<td>foreign word</td>
<td><em>mais <br /></em></td>
</tr>
<tr>
<td><span class="postag">IN </span></td>
<td>conjunction, subordinating or preposition</td>
<td><em>of, on, before, unless <br /></em></td>
</tr>
<tr>
<td><span class="postag">JJ </span></td>
<td>adjective</td>
<td><em>nice, easy </em></td>
</tr>
<tr>
<td><span class="postag">JJR </span></td>
<td>adjective, comparative</td>
<td><em>nicer, easier</em></td>
</tr>
<tr>
<td><span class="postag">JJS </span></td>
<td>adjective, superlative</td>
<td><em>nicest, easiest <br /></em></td>
</tr>
<tr>
<td><span class="postag">LS </span></td>
<td>list item marker</td>
<td><em>&nbsp;</em></td>
</tr>
<tr>
<td><span class="postag">MD </span></td>
<td>verb, modal auxiliary</td>
<td><em>may, should <br /></em></td>
</tr>
<tr>
<td><span class="postag">NN </span></td>
<td>noun, singular or mass</td>
<td><em>tiger, chair, laughter <br /></em></td>
</tr>
<tr>
<td><span class="postag">NNS </span></td>
<td>noun, plural</td>
<td><em>tigers, chairs, insects <br /></em></td>
</tr>
<tr>
<td><span class="postag">NNP </span></td>
<td>noun, proper singular</td>
<td><em>Germany, God, Alice <br /></em></td>
</tr>
<tr>
<td><span class="postag">NNPS </span></td>
<td>noun, proper plural</td>
<td><em>we met two <span style="text-decoration: underline;">Christmases</span> ago <br /></em></td>
</tr>
<tr>
<td><span class="postag">PDT </span></td>
<td>predeterminer</td>
<td><em><span style="text-decoration: underline;">both</span> his children <br /></em></td>
</tr>
<tr>
<td><span class="postag">POS</span></td>
<td>possessive ending</td>
<td><em>'s</em></td>
</tr>
<tr>
<td><span class="postag">PRP </span></td>
<td>pronoun, personal</td>
<td><em>me, you, it <br /></em></td>
</tr>
<tr>
<td><span class="postag">PRP$ </span></td>
<td>pronoun, possessive</td>
<td><em>my, your, our <br /></em></td>
</tr>
<tr>
<td><span class="postag">RB </span></td>
<td>adverb</td>
<td><em>extremely, loudly, hard&nbsp; <br /></em></td>
</tr>
<tr>
<td><span class="postag">RBR </span></td>
<td>adverb, comparative</td>
<td><em>better <br /></em></td>
</tr>
<tr>
<td><span class="postag">RBS </span></td>
<td>adverb, superlative</td>
<td><em>best <br /></em></td>
</tr>
<tr>
<td><span class="postag">RP </span></td>
<td>adverb, particle</td>
<td><em>about, off, up <br /></em></td>
</tr>
<tr>
<td><span class="postag">SYM </span></td>
<td>symbol</td>
<td><em>% <br /></em></td>
</tr>
<tr>
<td><span class="postag">TO </span></td>
<td>infinitival to</td>
<td><em>what <span style="text-decoration: underline;">to</span> do? <br /></em></td>
</tr>
<tr>
<td><span class="postag">UH </span></td>
<td>interjection</td>
<td><em>oh, oops, gosh <br /></em></td>
</tr>
<tr>
<td><span class="postag">VB </span></td>
<td>verb, base form</td>
<td><em>think <br /></em></td>
</tr>
<tr>
<td><span class="postag">VBZ </span></td>
<td>verb, 3rd person singular present</td>
<td><em>she <span style="text-decoration: underline;">thinks </span><br /></em></td>
</tr>
<tr>
<td><span class="postag">VBP </span></td>
<td>verb, non-3rd person singular present</td>
<td><em>I <span style="text-decoration: underline;">think </span><br /></em></td>
</tr>
<tr>
<td><span class="postag">VBD </span></td>
<td>verb, past tense</td>
<td><em>they <span style="text-decoration: underline;">thought </span><br /></em></td>
</tr>
<tr>
<td><span class="postag">VBN </span></td>
<td>verb, past participle</td>
<td><em>a <span style="text-decoration: underline;">sunken</span> ship <br /></em></td>
</tr>
<tr>
<td><span class="postag">VBG </span></td>
<td>verb, gerund or present participle</td>
<td><em><span style="text-decoration: underline;">thinking</span> is fun <br /></em></td>
</tr>
<tr>
<td><span class="postag">WDT </span></td>
<td><em>wh</em>-determiner</td>
<td><em>which, whatever, whichever <br /></em></td>
</tr>
<tr>
<td><span class="postag">WP </span></td>
<td><em>wh</em>-pronoun, personal</td>
<td><em>what, who, whom <br /></em></td>
</tr>
<tr>
<td><span class="postag">WP$</span></td>
<td><em>wh</em>-pronoun, possessive</td>
<td><em>whose, whosever <br /></em></td>
</tr>
<tr>
<td><span class="postag">WRB</span></td>
<td><em>wh</em>-adverb</td>
<td><em>where, when <br /></em></td>
</tr>
<tr>
<td><span class="postag">. </span></td>
<td>punctuation mark, sentence closer</td>
<td><em>.;?* <br /></em></td>
</tr>
<tr>
<td><span class="postag">, </span></td>
<td>punctuation mark, comma</td>
<td><em>, <br /></em></td>
</tr>
<tr>
<td><span class="postag">: </span></td>
<td>punctuation mark, colon</td>
<td><em>: <br /></em></td>
</tr>
<tr>
<td><span class="postag">( </span></td>
<td>contextual separator, left paren</td>
<td><em>( <br /></em></td>
</tr>
<tr>
<td><span class="postag">) </span></td>
<td>contextual separator, right paren</td>
<td><em>) <br /></em></td>
</tr>
</tbody>
</table>
<h3>Chunk tags</h3>
<p>Chunk tags are assigned to groups of words that belong together (i.e. phrases). The most common phrases are the noun phrase (<span class="postag">NP</span>, for example <em>the black cat</em>) and the verb phrase (<span class="postag">VP</span>, for example <em>is purring</em>).</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Tag </span></td>
<td><span class="smallcaps">Description </span></td>
<td><span class="smallcaps">Words </span></td>
<td><span class="smallcaps">Example </span></td>
<td align="right">%</td>
</tr>
<tr>
<td><span class="postag">NP </span></td>
<td>noun phrase<span class="postag">&nbsp;</span></td>
<td><span class="postag">DT</span>+<span class="postag">RB</span>+<span class="postag">JJ</span>+<span class="postag">NN</span> + <span class="postag">PR</span></td>
<td><em>the strange bird</em></td>
<td align="right">&nbsp;51</td>
</tr>
<tr>
<td><span class="postag">PP </span></td>
<td>prepositional phrase</td>
<td><span class="postag">TO</span>+<span class="postag">IN </span></td>
<td><em>in between</em></td>
<td align="right">&nbsp;19</td>
</tr>
<tr>
<td><span class="postag">VP&nbsp; </span></td>
<td>verb phrase&nbsp;</td>
<td><span class="postag">RB</span>+<span class="postag">MD</span>+<span class="postag">VB&nbsp; </span></td>
<td><em>was looking<br /></em></td>
<td align="right">9</td>
</tr>
<tr>
<td><span class="postag">ADVP</span></td>
<td>adverb phrase</td>
<td><span class="postag">RB</span></td>
<td><em>also<br /></em></td>
<td align="right">&nbsp;6</td>
</tr>
<tr>
<td><span class="postag">ADJP</span></td>
<td>adjective phrase<span class="postag">&nbsp;</span></td>
<td><span class="postag">CC</span>+<span class="postag">RB</span>+<span class="postag">JJ</span></td>
<td><em>warm and cosy</em></td>
<td align="right">&nbsp;3</td>
</tr>
<tr>
<td><span class="postag">SBAR</span></td>
<td>subordinating conjunction&nbsp;</td>
<td><span class="postag">IN</span></td>
<td><em><span style="text-decoration: underline;">whether</span> or not<br /></em></td>
<td align="right">3</td>
</tr>
<tr>
<td><span class="postag">PRT </span></td>
<td>particle</td>
<td><span class="postag">RP</span></td>
<td><em><span style="text-decoration: underline;">up</span> the stairs</em></td>
<td align="right">&nbsp;1</td>
</tr>
<tr>
<td><span class="postag">INTJ</span></td>
<td>interjection</td>
<td><span class="postag">UH</span></td>
<td><em>hello</em><em><br /></em></td>
<td align="right">&nbsp;0</td>
</tr>
</tbody>
</table>
<p>The IOB prefix marks whether a word is inside or outside of a chunk.</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Tag </span></td>
<td><span class="smallcaps">Description </span></td>
</tr>
<tr>
<td><span class="postag">I-</span></td>
<td>inside the chunk</td>
</tr>
<tr>
<td><span class="postag">B-</span></td>
<td>inside the chunk, preceding word is part of a different chunk</td>
</tr>
<tr>
<td><span class="postag">O </span></td>
<td>not part of a chunk</td>
</tr>
</tbody>
</table>
<p>A prepositional noun phrase (<span class="postag">PNP</span>) is a group of chunks starting with a preposition (<span class="postag">PP</span>) followed by noun phrases (<span class="postag">NP</span>), for example: <em>under the table</em>.</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Tag </span></td>
<td><span class="smallcaps">Description </span></td>
<td class="smallcaps">Chunks</td>
<td><span class="smallcaps">Example </span></td>
</tr>
<tr>
<td><span class="postag">PNP</span></td>
<td>prepositional noun phrase</td>
<td><span class="postag">PP</span>+<span class="postag">NP</span><span class="postag"> </span></td>
<td><em>as of today</em></td>
</tr>
</tbody>
</table>
<h3>Relation tags</h3>
<p>Relation tags describe the relation between different chunks, and clarify the role of a chunk in that relation. The most common roles in a sentence are <span class="postag">SBJ</span> (subject noun phrase) and <span class="postag">OBJ</span> (object noun phrase). They link <span class="postag">NP</span> to <span class="postag">VP</span> chunks. The subject of a sentence is the person, thing, place or idea that is <em>doing</em> or <em>being</em> something. The object of a sentence is the person/thing affected by the action.</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Tag </span></td>
<td><span class="smallcaps">Description </span></td>
<td class="smallcaps">Chunks</td>
<td><span class="smallcaps">Example </span></td>
<td align="right"><span class="smallcaps">%</span></td>
</tr>
<tr>
<td><span class="postag">-SBJ</span></td>
<td>sentence subject</td>
<td><span class="postag">NP</span><span class="postag"> </span></td>
<td><em><span style="text-decoration: underline;">the cat</span> sat on the mat<br /></em></td>
<td align="right">35</td>
</tr>
<tr>
<td><span class="postag">-OBJ</span></td>
<td>sentence object</td>
<td><span class="postag">NP</span>+<span class="postag">SBAR</span></td>
<td><em>the cat grabs <span style="text-decoration: underline;">the fish</span><br /></em></td>
<td align="right">27</td>
</tr>
<tr>
<td><span class="postag">-PRD </span></td>
<td>predicate</td>
<td><span class="postag">PP</span>+<span class="postag">NP</span>+<span class="postag">ADJP </span></td>
<td><em>the cat feels <span style="text-decoration: underline;">warm and fuzzy</span><br /></em></td>
<td align="right">7</td>
</tr>
<tr>
<td><span class="postag">-TMP</span></td>
<td>temporal&nbsp;</td>
<td><span class="postag">PP</span>+<span class="postag">NP</span>+<span class="postag">ADVP</span></td>
<td><em>arrive </em><em><span style="text-decoration: underline;">at noon</span> <br /></em></td>
<td align="right">7</td>
</tr>
<tr>
<td><span class="postag">-CLR </span></td>
<td>closely related</td>
<td><span class="postag">PP</span>+<span class="postag">NP</span>+<span class="postag">ADVP </span></td>
<td><em>work </em><em><span style="text-decoration: underline;">as a researcher</span> <br /></em></td>
<td align="right">6</td>
</tr>
<tr>
<td><span class="postag">-LOC</span></td>
<td>location&nbsp;</td>
<td><span class="postag">PP&nbsp; </span></td>
<td><em>live </em><em><span style="text-decoration: underline;">in Belgium</span> <br /></em></td>
<td align="right">4</td>
</tr>
<tr>
<td><span class="postag">-DIR&nbsp; </span></td>
<td>direction</td>
<td><span class="postag">PP </span></td>
<td><em>walk</em><em> <span style="text-decoration: underline;">towards</span> the door<br /></em></td>
<td align="right">3</td>
</tr>
<tr>
<td><span class="postag">-EXT</span></td>
<td>extent</td>
<td><span class="postag">PP</span>+<span class="postag">NP </span></td>
<td><em>drop <span style="text-decoration: underline;">10 %</span><br /></em></td>
<td align="right">1</td>
</tr>
<tr>
<td><span class="postag">-PRP</span></td>
<td>purpose</td>
<td><span class="postag">PP</span>+<span class="postag">SBAR </span></td>
<td><em>die <span style="text-decoration: underline;">as a result</span> of <br /></em></td>
<td align="right">1</td>
</tr>
</tbody>
</table>
<h3>Anchor tags</h3>
<p>Anchor tags describe how prepositional noun phrases (<span class="postag">PNP</span>) are attached to other chunks in the sentence. For example, in the sentence, <em>I eat pizza with a fork</em>, the anchor of <em>with a fork</em> is <em>eat</em> because it answers the question: "In what way do I eat?"</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Tag </span></td>
<td><span class="smallcaps">Description </span></td>
<td><span class="smallcaps">Example </span></td>
</tr>
<tr>
<td><span class="postag">A1</span></td>
<td>anchor chunk that corresponds to <span class="postag">P1</span></td>
<td><em><span style="text-decoration: underline;">eat</span> with a fork<br /></em></td>
</tr>
<tr>
<td><span class="postag">P1 </span></td>
<td><span class="postag">PNP</span> that corresponds to <span class="postag">A1 </span></td>
<td><em>eat <span style="text-decoration: underline;">with a fork</span><br /></em></td>
</tr>
</tbody>
</table>
<p>&nbsp;</p>
<p><strong>Occurrence estimate </strong><span class="small"><br /></span></p>
<p><span class="small">The given percentages for chunk and relation tags are based on tenfold cross validation on sections 10 to 19 of the WSJ Corpus of the Penn Treebank II by Sabine Buchholz, from which we derived a rough indication. The estimate means that if 100 chunk tags are found, about 50 would be <span class="postag">NP</span> tags and 35 would have an <span class="postag">SBJ</span> relation tag. About 30 of the chunks would be tagged as <span class="postag">NP-SBJ</span>, and 15 as <span class="postag">NP-OBJ</span>.&nbsp;</span></p>
<p><span class="small"><span style="text-decoration: underline;">Reference</span>: Buchholz, S. (2002). <em>Memory-Based Grammatical Relation Finding</em>. ILK, Tilburg University.</span></p>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

File diff suppressed because one or more lines are too long

@ -1,700 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-db</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-db" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-db</a></div>
<h1>pattern.db</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1432" class="node node-type-page"><div class="node-inner">
<div class="content">
<p class="big">The pattern.db module contains wrappers for databases (SQLite, MySQL), Unicode CSV files and Python's datetime. It offers a convenient way to work with tabular data, for example retrieved with the pattern.web module.</p>
<p>It can be used by itself or with other <a href="pattern.html">pattern</a> modules: <a href="pattern-web.html">web</a> | db | <a href="pattern-en.html">en</a> | <a href="pattern-search.html">search</a> <span class="blue"></span> | <a href="pattern-vector.html">vector</a> | <a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<ul style="margin-top: 0;">
<li><a href="#database">Database</a> <span class="smallcaps link-maintenance">(sqlite + mysql)</span></li>
<li><a href="#table">Table</a></li>
<li><a href="#query">Query</a></li>
<li><a href="#datasheet">Datasheet</a> <span class="smallcaps link-maintenance">(<a href="#csv">csv</a>)</span></li>
<li><a href="#date">Date</a></li>
</ul>
<p>&nbsp;</p>
<hr />
<h2><a name="database"></a>Database</h2>
<p>A database is a collection of tables. A table has rows of data with a specific data type (e.g., string, float) for each field or column. A database engine provides an interface to the database, using <a href="https://en.wikipedia.org/wiki/SQL" target="_blank">SQL</a> statements (Structured Query Language). Python 2.5+ comes bundled with the SQLite engine. The <a href="http://www.mysql.com/" target="_blank">MySQL</a> engine requires the <a href="http://sourceforge.net/projects/mysql-python/" target="_blank">MySQL-Python</a> bindings. Note that a 32-bit Python requires a 32-bit MySQL.</p>
<p>The <span class="inline_code">Database()</span> constructor creates (if necessary) and returns an <span class="inline_code">SQLITE</span> or <span class="inline_code">MYSQL</span> database. With <span class="inline_code">SQLITE</span>, it will create a file with the given name in the current folder.</p>
<pre class="brush:python; gutter:false; light:true;">db = Database(
name,
host = 'localhost',
port = 3306,
username = 'root',
password = '',
type = SQLITE
)
</pre><pre class="brush:python; gutter:false; light:true;">db.type # SQLITE | MYSQL
db.name # Database name.
db.host # Database host (MySQL).
db.port # Database port (MySQL).
db.username # Database username (MySQL).
db.password # Database password (MySQL).
db.tables # Dictionary of (name, Table)-items.
db.relations # List of relations, see Database.link().
db.query # Last executed SQL query.
db.connected # True after Database.connect(). </pre><pre class="brush:python; gutter:false; light:true;">db.connect() # Happens automatically.
db.disconnect()</pre><pre class="brush:python; gutter:false; light:true;">db.create(table, fields=[])
db.remove(table)
db.link(table1, field1, table2, field2, join=LEFT) </pre><pre class="brush:python; gutter:false; light:true;">db.execute(SQL, commit=False)
db.commit()
db.escape(value) # "a cat's tail" =&gt; "'a cat\'s tail'"</pre><ul>
<li><span class="inline_code">Database.execute()</span> returns an iterator of rows for the given SQL query.</li>
<li><span class="inline_code">Database.commit()</span> commits the changes of pending <span class="inline_code">INSERT</span>, <span class="inline_code">UPDATE</span>, <span class="inline_code">DELETE</span> queries.</li>
<li><span class="inline_code">Database.escape()</span> safely quotes and escapes field values.</li>
</ul>
<h3>Create table</h3>
<p><span class="inline_code">Database.create()</span> creates a new table in the database. It takes a table name and a list of row fields, where each field is defined with the <span class="inline_code">field()</span> function. Each field has a <span class="inline_code">name</span> (a-z + underscores) and a <span class="inline_code">type</span>, with an optional <span class="inline_code">default</span> value for new rows. The <span class="inline_code">pk()</span> function can be used for primary keys.</p>
<pre class="brush:python; gutter:false; light:true;">field(name, type=STRING, default=None, index=False, optional=True)</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">pk(name='id') # field('id', INTEGER, index=PRIMARY, optional=False) </pre><table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Type</span></td>
<td><span class="smallcaps">Value</span></td>
<td><span class="smallcaps">Example</span></td>
</tr>
<tr>
<td><span class="inline_code">STRING</span></td>
<td><span class="inline_code">str</span>, <span class="inline_code">unicode</span> (1-255 characters)</td>
<td><span class="inline_code">u'Schrödinger'</span></td>
</tr>
<tr>
<td><span class="inline_code">INTEGER</span></td>
<td><span class="inline_code">int</span></td>
<td><span class="inline_code">42</span></td>
</tr>
<tr>
<td><span class="inline_code">FLOAT</span></td>
<td><span class="inline_code">float</span></td>
<td><span class="inline_code">3.14159</span></td>
</tr>
<tr>
<td><span class="inline_code">TEXT</span></td>
<td><span class="inline_code">str</span>, <span class="inline_code">unicode</span></td>
<td><span class="inline_code">open('file.txt').read() </span></td>
</tr>
<tr>
<td><span class="inline_code">BLOB</span></td>
<td><span class="inline_code">str</span> (binary, e.g., PDF, PNG)</td>
<td><span class="inline_code">db.binary(open('img.jpg',</span> <span class="inline_code">'rb').read())</span></td>
</tr>
<tr>
<td><span class="inline_code">BOOLEAN</span></td>
<td><span class="inline_code">bool</span></td>
<td><span class="inline_code">True</span>, <span class="inline_code">False</span></td>
</tr>
<tr>
<td><span class="inline_code">DATE</span></td>
<td><span class="inline_code">Date</span></td>
<td><span class="inline_code">date('1999-12-31 23:59:59')</span></td>
</tr>
</tbody>
</table>
<p>By default, a <span class="inline_code">STRING</span> field can contain up to 100 characters. The length (1-255) can be changed by calling <span class="inline_code">STRING</span> as a function, e.g., <span class="inline_code">type=STRING(255)</span>. For longer strings, use <span class="inline_code">TEXT</span>. The default value for a <span class="inline_code">DATE</span> field is <span class="inline_code">NOW</span>.</p>
<p>With <span class="inline_code">index=True</span>, the field is indexed for faster search. The index can also be set to <span class="inline_code">UNIQUE</span> (no duplicates) or <span class="inline_code">PRIMARY</span>. A table must have a primary key field that uniquely identifies each row (i.e., an id). Integer primary keys are auto-numbered, there is no need to set the value manually in new rows.</p>
<p>With <span class="inline_code">optional=True</span>, the field is allowed to contain <span class="inline_code">None</span>.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.db import Database, field, pk, STRING, BOOLEAN, DATE, NOW
&gt;&gt;&gt;
&gt;&gt;&gt; db = Database('my_stuff')
&gt;&gt;&gt; db.create('pets', fields=(
&gt;&gt;&gt; pk(),
&gt;&gt;&gt; field('name', STRING(80), index=True),
&gt;&gt;&gt; field('type', STRING(20)),
&gt;&gt;&gt; field('tail', BOOLEAN),
&gt;&gt;&gt; field('date_birth', DATE, default=None),
&gt;&gt;&gt; field('date_created', DATE, default=NOW)
&gt;&gt;&gt; ))</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.append(name=u'Schrödinger', type='cat', tail=True)
&gt;&gt;&gt; print db.pets.rows()[0]
(1, u'Schrödinger', u'cat', True, None, Date('2013-12-11 10:09:08'))</pre></div>
<h3>Create table from XML</h3>
<p><span class="inline_code">Database.create()</span> can also take a <span class="inline_code">Table.xml</span> or <span class="inline_code">Query.xml</span>. It creates a new table and copies the row data in the given XML string. An optional <span class="inline_code">name</span> parameter can be used to rename the new table. In <span class="inline_code">Query.xml</span>, a field name may contain a period. It will be replaced with an underscore (e.g., pets.name → pets_name). Alternatively, an alias can be defined in the <span class="inline_code">Query.aliases</span> dictionary.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="table"></a>Table</h2>
<p>A <span class="inline_code">Table</span> is a list of rows, with one or more fields (i.e., table columns) of a certain type (e.g., string or number). A new table can be created with <span class="inline_code">Database.create()</span>. A <span class="inline_code">TableError</span> is raised if a table with the given name already exists. An existing table can be retrieved with <span class="inline_code">Database.tables[name]</span>, <span class="inline_code">Database[name]</span> or <span class="inline_code">Database.&lt;name&gt;</span>.</p>
<pre class="brush:python; gutter:false; light:true;">table = Database.tables[name]</pre><pre class="brush:python; gutter:false; light:true;">table.db # Parent Database.
table.name # Table name (a-z + underscores).
table.fields # List of field names (i.e., columns).
table.schema # Dictionary of (field, Schema)-items.
table.default # Dictionary of (field, value)-items for new rows.
table.pk # Primary key field name.</pre><pre class="brush:python; gutter:false; light:true;">table.count() # Total number of rows (len(table) also works).
table.rows() # List of rows, each a tuple of fields.
</pre><pre class="brush:python; gutter:false; light:true;">table.record(row) # Dictionary of (field, value)-items for given row.</pre><pre class="brush:python; gutter:false; light:true;">table.append(fields={}, commit=True)
table.update(id, fields={}, commit=True)
table.remove(id, commit=True)
</pre><pre class="brush:python; gutter:false; light:true;">table.filter(*args, **kwargs)
table.search(*args, **kwargs) </pre><pre class="brush:python; gutter:false; light:true;">table.xml # XML string with the table schema and rows.
table.datasheet # Datasheet object (see below).</pre><ul>
<li><span class="inline_code">Table.rows()</span> returns a list of all rows. To iterate rows memory-efficiently, use <span class="inline_code">iter(</span><span class="inline_code">Table)</span>.</li>
<li><span class="inline_code">Table.append()</span>, <span class="inline_code">update()</span> and <span class="inline_code">remove()</span> modify the table contents.<br />With <span class="inline_code">commit=False</span>, changes are only committed after <span class="inline_code">Database.commit()</span> (= faster in batch).</li>
<li><span class="inline_code">Table.filter()</span> returns a subset of rows with a subset of fields.<br />For example: <span class="inline_code">table.filter('name',</span> <span class="inline_code">type='cat')</span>.</li>
</ul>
<h3>Table schema</h3>
<p>The <span class="inline_code">Table.schema</span> dictionary contains field name → <span class="inline_code">Schema</span> items.</p>
<pre class="brush:python; gutter:false; light:true;">schema = Table.schema[fieldname]</pre><pre class="brush:python; gutter:false; light:true;">schema.name # Field name.
schema.type # STRING, INTEGER, FLOAT, TEXT, BLOB, BOOLEAN, DATE
schema.length # STRING field length.
schema.default # Default value.
schema.index # PRIMARY | UNIQUE | True | False
schema.optional # True or False. </pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.db import Database
&gt;&gt;&gt;
&gt;&gt;&gt; db = Database('my_stuff')
&gt;&gt;&gt;
&gt;&gt;&gt; print db.pets.fields
&gt;&gt;&gt; print db.pets.schema['name'].type
&gt;&gt;&gt; print db.pets.schema['name'].length
['id', 'name', 'type', 'tail', 'date_birth', 'date_created']
STRING
80 </pre></div>
<h3>Append row</h3>
<p><span class="inline_code">Table.append()</span> adds a new row with the given field values. It returns the row id if the table has a primary key generated with <span class="inline_code">pk()</span>. Field values can be given as optional parameters, a dictionary or a tuple. Field values for a <span class="inline_code">BLOB</span> field must be wrapped in <span class="inline_code">Database.binary()</span>.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.append(name=u'Schrödinger', date_birth=date('2009-08-12'))</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.append({'name': u'Schrödinger', 'date_birth': date('2009-08-12')}) </pre></div>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; db.pets.append((u'Schrödinger', 'cat', True, date('2009-08-12'))) # in-order</pre></div>
<h3>Update row</h3>
<p><span class="inline_code">Table.update()</span> updates values in the row with the given primary key. A batch of rows can be updated using a <a class="link-maintenance" href="#filter">filter</a>, or a chain of filters with <span class="inline_code">any()</span> or <span class="inline_code">all()</span>. In the last example, all rows with <span class="inline_code">type='cat'</span> will have their <span class="inline_code">tail</span> field set to <span class="inline_code">True</span>.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.update(1, type='cat') # set type='cat' in row with id=1.</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.update(1, {'type': 'cat'})</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.update(eq('type', 'cat'), tail=True) </pre></div>
<h3>Remove row</h3>
<p><span class="inline_code">Table.remove()</span> removes the row with the given primary key:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.remove(1)</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.remove(ALL)</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.remove(all(eq('type', 'cat'), lt(year('date_birth'), 1990, '&lt;')))</pre></div>
<p>The last example removes all rows that have <span class="inline_code">type='cat'</span> AND year of birth before 1990.</p>
<h3><span>Filter rows</span></h3>
<p><span class="inline_code">Table.filter()</span> returns a list of rows filtered by field value(s), where each row is a tuple of fields. The first parameter defines which fields to return. It can be a single field name, a list of field names or <span class="inline_code">ALL</span>. The following parameters are optional and define field constraints. They can also be given as a dictionary:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.filter('name') # all rows, name</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.filter(('id', 'name')) # all rows, name + id</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.filter(ALL, type='cat') # type='cat', all fields</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.filter(ALL, type=('cat', 'dog')) # type='cat' OR type='dog' </pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.filter(ALL, type='*at') # type='cat' OR 'hat' OR 'brat', ...</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.filter(ALL, type='cat', tail=True) # type='cat' AND tail=True </pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.pets.filter('id', {'type': 'cat', 'tail': True})
</pre></div>
<p>More complex queries can be constructed with a <span class="inline_code">Query</span>.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="query"></a>Query</h2>
<p><span class="inline_code">Table.search()</span> returns a new <span class="inline_code">Query</span> with options for filtering, sorting and ordering rows by field value(s). It can include fields from other, related tables.</p>
<pre class="brush:python; gutter:false; light:true;">query = Table.search(
fields = ALL,
filters = [],
relations = [],
sort = None,
order = ASCENDING,
group = None,
function = FIRST,
range = None
)</pre><pre class="brush:python; gutter:false; light:true;">query.table # Parent Table.
query.fields # Field name, list of field names, or ALL.
query.aliases # Dictionary of (field name, alias)-items.
query.filters # List of filter() objects.
query.relations # List of rel() objects.
query.sort # Field name or list of field names.
query.order # ASCENDING | DESCENDING
query.group # Field name or list of field names.
query.function # FIRST, LAST, COUNT, MIN, MAX, SUM, AVG, STDEV, CONCATENATE
query.range # (start, stop)-tuple, e.g. rows 11-20.</pre><pre class="brush:python; gutter:false; light:true;">query.sql() # SQL string, can be used with Database.execute().</pre><pre class="brush:python; gutter:false; light:true;">query.rows() # List of rows, each a tuple of fields.</pre><pre class="brush:python; gutter:false; light:true;">query.record(row) # Dictionary of (field, value)-items for given row.</pre><pre class="brush:python; gutter:false; light:true;">query.xml # XML string with the query schema and rows.</pre><p>To iterate rows memory-efficiently, use <span class="inline_code">iter(Query)</span> instead of <span class="inline_code">Query.rows()</span>.</p>
<h3><a name="filter"></a>Query filter</h3>
<p>The <span class="inline_code">filter()</span> function creates a field-value constraint that matches certain rows in a table. A list of filters can be passed to the <span class="inline_code">filters</span> parameter of a <span class="inline_code">Query</span>.</p>
<pre class="brush:python; gutter:false; light:true;">filter(field, value, comparison='=')</pre><table class="border">
<tbody>
<tr>
<td style="text-align: center;"><span class="smallcaps">Comparison</span></td>
<td><span class="smallcaps">Description</span></td>
<td><span class="smallcaps">Example</span></td>
<td style="text-align: center;"><span class="smallcaps">Alias</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">=</span></td>
<td>equal to</td>
<td><span class="inline_code">filter('type',</span> <span class="inline_code">('cat',</span> <span class="inline_code">'dog'),</span> <span class="inline_code">'=') </span></td>
<td style="text-align: left;"><span class="inline_code">&nbsp;eq()</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">i=</span></td>
<td>equal to (case-insensitive)</td>
<td><span class="inline_code">filter('name',</span> <span class="inline_code">'tig*',</span> <span class="inline_code">'i=') </span></td>
<td style="text-align: left;"><span class="inline_code">&nbsp;eqi()</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">!=</span></td>
<td>not equal to</td>
<td><span class="inline_code">filter('name',</span> <span class="inline_code">'*y',</span> <span class="inline_code">'!=')</span></td>
<td style="text-align: left;"><span class="inline_code">&nbsp;ne()</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">&gt;</span></td>
<td>greater than</td>
<td><span class="inline_code">filter('weight',</span> <span class="inline_code">10,</span> <span class="inline_code">'&gt;') </span></td>
<td style="text-align: left;"><span class="inline_code">&nbsp;gt()</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">&lt;</span></td>
<td>less than</td>
<td><span class="inline_code">filter('weight',</span> <span class="inline_code">10,</span> <span class="inline_code">'&lt;') </span></td>
<td style="text-align: left;"><span class="inline_code">&nbsp;lt()</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">&gt;=</span></td>
<td>greater than or equal to</td>
<td><span class="inline_code">filter(year('date'),</span> <span class="inline_code">1999,</span> <span class="inline_code">'&gt;=') </span></td>
<td style="text-align: left;"><span class="inline_code">&nbsp;gte()</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">&lt;=</span></td>
<td>less than or equal to</td>
<td><span class="inline_code">filter(year('date'),</span> <span class="inline_code">2002,</span> <span class="inline_code">'&lt;=')</span></td>
<td style="text-align: left;"><span class="inline_code">&nbsp;lte()</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">:</span></td>
<td>between (inclusive)</td>
<td><span class="inline_code">filter(year('date'),</span> <span class="inline_code">(1999,</span> <span class="inline_code">2002),</span> <span class="inline_code">':')</span></td>
<td style="text-align: left;"><span class="inline_code">&nbsp;rng()</span></td>
</tr>
</tbody>
</table>
<p>The field name of a <span class="inline_code">DATE</span> field can be passed to the&nbsp;<span class="inline_code">year()</span>, <span class="inline_code">month()</span>, <span class="inline_code">day()</span>, <span class="inline_code">hour()</span>, <span class="inline_code">minute()</span> or <span class="inline_code">second()</span> function. The short aliases of <span class="inline_code">filter()</span> have a preset comparison operator.</p>
<h3>Query filter chain</h3>
<p>Filters can be chained together. The <span class="inline_code">all()</span> function returns a list with AND logic. The <span class="inline_code">any()</span> function returns a list with OR logic. In the example below, the first query matches <span style="text-decoration: underline;">all</span> cats named Taxi. The second and third queries match <span style="text-decoration: underline;">any</span> pet that is a cat OR that is named Taxi.</p>
<pre class="brush:python; gutter:false; light:true;">all(filter1, filter2, ...) # Rows must match ALL of the filters.</pre><pre class="brush:python; gutter:false; light:true;">any(filter1, filter2, ...) # Rows must match ANY of the filters.</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.db import Database, eq, all, any
&gt;&gt;&gt;
&gt;&gt;&gt; db = Database('my_stuff')
&gt;&gt;&gt;
&gt;&gt;&gt; db.pets.search(filters=all(eq('name', 'Taxi'), eq('type', 'cat')))
&gt;&gt;&gt; db.pets.search(filters=any(eq('name', 'Taxi'), eq('type', 'cat')))
&gt;&gt;&gt; db.pets.search(filters=any(name='Taxi', type='cat')) </pre></div>
<p>Lists created with <span class="inline_code">all()</span> and <span class="inline_code">any()</span> can be nested to define complex search criteria. The example below matches all pets that are cats, and whose name starts with Fluff- OR ends with a -y:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; f = any(eq('name', 'Fluff*'), eq('name', '*y')) # OR
&gt;&gt;&gt; f = all(eq('type', 'cat'), f) # AND
&gt;&gt;&gt;
&gt;&gt;&gt; for row in db.pets.search(filters=f):
&gt;&gt;&gt; print row</pre></div>
<p>The syntax can be even more concise:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; for row in db.pets.search(filters=all(name=('Fluff*', '*y'), type='cat')):
&gt;&gt;&gt; print row </pre></div>
<h3>Query relation</h3>
<p>The <span class="inline_code">rel()</span> function defines a relation between two fields in different tables (usually id's).</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">rel(field1, field2, table, join=LEFT) # LEFT | INNER</pre><p>The optional <span class="inline_code">join</span> parameter defines how rows are matched. <span class="inline_code">LEFT</span> takes all rows from the base table, with additional fields from the related table. For a row with no match between <span class="inline_code">field1</span> and <span class="inline_code">field2</span>, these fields have value <span class="inline_code">None</span>. <span class="inline_code">INNER</span> takes the subset of rows that have a match between <span class="inline_code">field1</span> and <span class="inline_code">field2</span>.</p>
<p>A well-known example is a database app that processes invoices. Say we have a products table and an orders table. Each order has a product id instead of all product details. Each product id can occur in multiple orders. This approach is called database normalization. It avoids duplicate data. To generate an invoice, we can combine product details and order details using a query relation.</p>
<p>The following example demonstrates a simple products + customers + orders database app:</p>
<table class="border=">
<tbody>
<tr>
<td>
<table class="border" style="margin: 0;">
<tbody>
<tr>
<td style="text-align: center;" colspan="3"><span class="smallcaps">products</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="smallcaps">id</span></td>
<td style="text-align: left;"><span class="smallcaps">name</span></td>
<td style="text-align: center;"><span class="smallcaps">price</span></td>
</tr>
<tr>
<td style="text-align: center;">1</td>
<td style="text-align: left;">pizza</td>
<td style="text-align: center;">15</td>
</tr>
<tr>
<td style="text-align: center;">2</td>
<td style="text-align: left;">garlic bread</td>
<td style="text-align: center;">3</td>
</tr>
</tbody>
</table>
</td>
<td>
<table class="border" style="margin: 0;">
<tbody>
<tr>
<td style="text-align: center;" colspan="2"><span class="smallcaps">customers</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="smallcaps">id</span></td>
<td style="text-align: left;"><span class="smallcaps">name</span></td>
</tr>
<tr>
<td style="text-align: center;">1</td>
<td style="text-align: left;">Schrödinger</td>
</tr>
<tr>
<td style="text-align: center;">2</td>
<td style="text-align: left;">Hofstadter</td>
</tr>
</tbody>
</table>
</td>
<td>
<table class="border" style="margin: 0;">
<tbody>
<tr>
<td style="text-align: center;" colspan="3"><span class="smallcaps">orders</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="smallcaps">id</span></td>
<td style="text-align: center;"><span class="smallcaps">product</span></td>
<td style="text-align: center;"><span class="smallcaps">customer</span></td>
</tr>
<tr>
<td style="text-align: center;">1</td>
<td style="text-align: center;">1</td>
<td style="text-align: center;">2</td>
</tr>
<tr>
<td>&nbsp;</td>
<td>&nbsp;</td>
<td>&nbsp;</td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.db import Database, field, pk, INTEGER as I
&gt;&gt;&gt;
&gt;&gt;&gt; db = Database('pizza_delivery')
&gt;&gt;&gt;
&gt;&gt;&gt; db.create( 'products', (pk(), field('name'), field('price', I)))
&gt;&gt;&gt; db.create('customers', (pk(), field('name')))
&gt;&gt;&gt; db.create( 'orders', (pk(), field('product', I), field('customer', I)))</pre></div>
<div class="example">Add products and customers. Pizza delivery is open for business!</div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.products.append(name='pizza', price=15)
&gt;&gt;&gt; db.products.append(name='garlic bread', price=3)</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.customers.append(name=u'Schrödinger')
&gt;&gt;&gt; db.customers.append(name=u'Hofstadter')</pre></div>
<p>Hofstadter orders a pizza.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; db.orders.append(product=1, customer=2)</pre></div>
<div class="example">An orders query with relations to products and customers generates a human-readable invoice:</div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.db import Database, rel
&gt;&gt;&gt;
&gt;&gt;&gt; db = Database('pizza_delivery')
&gt;&gt;&gt;
&gt;&gt;&gt; f = ('orders.id', 'customers.name', 'products.name', 'products.price')
&gt;&gt;&gt; q = db.orders.search(f, relations=(
&gt;&gt;&gt; rel('orders.customer', 'customers.id', 'customers'),
&gt;&gt;&gt; rel('orders.product', 'products.id', 'products'))
&gt;&gt;&gt; )
&gt;&gt;&gt; for row in q:
&gt;&gt;&gt; print q.record(row)
{ 'orders.id' : 1,
'customers.name' : u'Hofstadter',
'products.name' : u'pizza',
'products.price' : 15 }</pre></div>
<div class="example">If a relation is used repeatedly, define it once with <span class="inline_code">Database.link()</span>. It will be available in every <span class="inline_code">Query</span>.</div>
<h3>Grouping rows</h3>
<p>A <span class="inline_code">Query</span> has an optional parameter <span class="inline_code">group</span> that can be used to merge rows on duplicate field values. The given <span class="inline_code">function</span> is applied to the other fields. It can also be a list with a function for each field.</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Function</span></td>
<td style="text-align: center;"><span class="smallcaps">Field type</span></td>
<td><span class="smallcaps">Description</span></td>
</tr>
<tr>
<td><span class="inline_code">FIRST</span></td>
<td style="text-align: center;">any</td>
<td>The first row field in the group.</td>
</tr>
<tr>
<td><span class="inline_code">LAST</span></td>
<td style="text-align: center;">any</td>
<td>The last row field in the group.</td>
</tr>
<tr>
<td><span class="inline_code">COUNT</span></td>
<td style="text-align: center;">any</td>
<td>The number of rows in the group.</td>
</tr>
<tr>
<td><span class="inline_code">MIN</span></td>
<td style="text-align: center;"><span class="inline_code">INTEGER</span> + <span class="inline_code">FLOAT</span></td>
<td>The lowest field value in the group.</td>
</tr>
<tr>
<td><span class="inline_code">MAX</span></td>
<td style="text-align: center;"><span class="inline_code">INTEGER</span> + <span class="inline_code">FLOAT</span></td>
<td>The highest field value in the group.</td>
</tr>
<tr>
<td><span class="inline_code">SUM</span></td>
<td style="text-align: center;"><span class="inline_code">INTEGER</span> + <span class="inline_code">FLOAT</span></td>
<td>The sum of all field values in the group.</td>
</tr>
<tr>
<td><span class="inline_code">AVG</span></td>
<td style="text-align: center;"><span class="inline_code">INTEGER</span> + <span class="inline_code">FLOAT</span></td>
<td>The average of all field values in the group.</td>
</tr>
<tr>
<td><span class="inline_code">STDEV</span></td>
<td style="text-align: center;"><span class="inline_code">INTEGER</span> + <span class="inline_code">FLOAT</span></td>
<td>The standard deviation (= variation from average).</td>
</tr>
<tr>
<td><span class="inline_code">CONCATENATE</span></td>
<td style="text-align: center;"><span class="inline_code">STRING</span></td>
<td>Joins all field values with a comma.</td>
</tr>
</tbody>
</table>
<p>For example, to get the total revenue per ordered product:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; print db.orders.search(
&gt;&gt;&gt; fields = ('products.name', 'products.price'),
&gt;&gt;&gt; relations = rel('product', 'products.id', 'products'),
&gt;&gt;&gt; group = 'products.name', # Merge orders with same product name.
&gt;&gt;&gt; function = SUM # Sum of product prices.
&gt;&gt;&gt; ).rows()</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="datasheet"></a>Datasheet</h2>
<p>A <span class="inline_code">Datasheet</span> is a matrix of rows and columns, where each row and column can be retrieved as a list. The data can be imported or exported as a CSV-file. Optionally, the given <span class="inline_code">fields</span> is a list of <span class="inline_code">(name,</span> <span class="inline_code">type)</span> headers, where <span class="inline_code">type</span> can be <span class="inline_code">STRING</span>, <span class="inline_code">TEXT</span>, <span class="inline_code">INTEGER</span>, <span class="inline_code">FLOAT</span>, <span class="inline_code">BOOLEAN</span>, <span class="inline_code">BLOB</span> or <span class="inline_code">DATE</span>.</p>
<pre class="brush:python; gutter:false; light:true;">datasheet = Datasheet(rows=[], fields=None)</pre><pre class="brush:python; gutter:false; light:true;">datasheet = Datasheet.load(path, separator=',', decoder=lambda v: v, headers=False)
</pre><pre class="brush:python; gutter:false; light:true;">datasheet.rows # List of rows (each row = list of values).
datasheet.columns # List of columns (each column = list of values).
datasheet.fields # List of (name, type) column headers.
datasheet.&lt;field&gt; # List of column values. </pre><pre class="brush:python; gutter:false; light:true;">datasheet[i] # Row at index i.
datasheet[i, j] # Value in row i at column j.
datasheet[i1:i2, j] # Slice of column j from rows i1-i2.
datasheet[i, j1:j2] # Slice of columns j1-j2 from row i.
datasheet[i1:i2, j1:j2] # Datasheet with columns j1-j2 from rows i1-i2.
datasheet[:] # Datasheet copy. </pre><pre class="brush:python; gutter:false; light:true;">datasheet.insert(i, row, default=None)
datasheet.append(row, default=None)
datasheet.extend(rows, default=None)
datasheet.copy(rows=ALL, columns=ALL)</pre><pre class="brush:python; gutter:false; light:true;">datasheet.group(j, function=FIRST, key=lambda v: v)</pre><pre class="brush:python; gutter:false; light:true;">datasheet.save(path, separator=',', encoder=lambda v: v, headers=False)</pre><pre class="brush:python; gutter:false; light:true;">datasheet.json # JSON-formatted string.</pre><ul>
<li><span class="inline_code">Datasheet.insert()</span> and <span class="inline_code">append()</span> fill missing columns with the <span class="inline_code">default</span> value.</li>
<li><span class="inline_code">Datasheet.columns.insert()</span> and <span class="inline_code">append()</span> fill missing rows with the <span class="inline_code">default</span> value.<br />An optional <span class="inline_code">field</span> parameter can be used to supply a (<span class="inline_code">name</span>, <span class="inline_code">type</span>) column header.</li>
<li><span class="inline_code">Datasheet.copy()</span> returns a new <span class="inline_code">Datasheet</span> from a selective list of row and/or column indices.</li>
<li>To rotate a datasheet 90 degrees, use <span class="inline_code">datasheet</span> <span class="inline_code">=</span> <span class="inline_code">flip(datasheet)</span>.</li>
</ul>
<p>For example:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.db import Datasheet
&gt;&gt;&gt;
&gt;&gt;&gt; ds = Datasheet()
&gt;&gt;&gt; ds.append((u'Schrödinger', 'cat'))
&gt;&gt;&gt; ds.append((u'Hofstadter', 'cat'))
&gt;&gt;&gt; ds.save('pets.csv')
&gt;&gt;&gt;
&gt;&gt;&gt; ds = Datasheet.load('pets.csv')
&gt;&gt;&gt; print ds
[[u'Schrödinger', 'cat'],
[ u'Hofstadter', 'cat']]</pre></div>
<h3>Grouping rows</h3>
<p><span class="inline_code">Datasheet.group(j)</span> returns a new <span class="inline_code">Datasheet</span> with unique values in column <span class="inline_code">j</span>. It merges rows using a given <span class="inline_code">function</span> that takes a list of column values and returns a single value. Predefined functions are <span class="inline_code">FIRST</span>, <span class="inline_code">LAST</span>, <span class="inline_code">COUNT</span>, <span class="inline_code">MIN</span>, <span class="inline_code">MAX</span>, <span class="inline_code">SUM</span>, <span class="inline_code">AVG</span>, <span class="inline_code">STDEV</span> and <span class="inline_code">CONCATENATE</span>. It can also be a list of functions.</p>
<p>The optional <span class="inline_code">key</span> can be used to compare the values in column <span class="inline_code">j</span>. For example, <span class="inline_code">lambda</span> <span class="inline_code">date:</span> <span class="inline_code">date.year</span> groups a column of <span class="inline_code">Date</span> objects by year.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.db import Datasheet, COUNT, pprint
&gt;&gt;&gt;
&gt;&gt;&gt; ds = Datasheet(rows=[
&gt;&gt;&gt; (1, u'Schrödinger', 'cat'),
&gt;&gt;&gt; (2, u'Hofstadter', 'cat'),
&gt;&gt;&gt; (3, u'Taxi', 'dog')
&gt;&gt;&gt; ])
&gt;&gt;&gt;
&gt;&gt;&gt; g = ds.copy(columns=[2, 0]) # A copy with type &amp; id.
&gt;&gt;&gt; g = g.group(0, COUNT) # Group type, count rows per type.
&gt;&gt;&gt; pprint(g, fill='')
cat 2
dog 1 </pre></div>
<h3>Sorting rows &amp; columns</h3>
<p><span class="inline_code">Datasheet.columns[j].sort()</span> sorts the rows according to the values in column <span class="inline_code">j</span>. <br /><span class="inline_code">Datasheet.columns.sort()</span> can be used to change the column order:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; ds.columns.sort(order=[0, 2, 1])
&gt;&gt;&gt; pprint(ds, fill='')
1 cat Schrödinger
2 cat Hofstadter
3 dog Taxi</pre></div>
<p><span class="inline_code">Datasheet.columns.swap(j1,j2)</span> swaps two individual columns with given indices.</p>
<h3><a name="csv"></a>CSV import &amp; export</h3>
<p><span class="inline_code">Datasheet.save()</span> exports the matrix as a CSV file. <span class="inline_code">Datasheet.load()</span> returns a <span class="inline_code">Datasheet</span> from a given CSV file. CSV (comma-separated values) is a simple text format for tabular data, where each line is a row and each value is separated by a comma.</p>
<pre class="brush:python; gutter:false; light:true;">datasheet = Datasheet.load(path, separator=',', decoder=lambda v: v, headers=False)</pre><pre class="brush:python; gutter:false; light:true;">datasheet.save(path, separator=',', encoder=lambda v: v, headers=False)</pre><p>On export, all&nbsp;<span class="inline_code">str</span>, <span class="inline_code">int</span>, <span class="inline_code">float</span>, <span class="inline_code">bool</span> and <span class="inline_code">Date</span> values are converted to Unicode. An <span class="inline_code">encoder</span> can be given for other data types. On import, all values in the datasheet will be Unicode unless a <span class="inline_code">decoder</span> is given.</p>
<p>With <span class="inline_code">headers=True</span>, the <span class="inline_code">Datasheet.fields</span> headers are exported and imported (first line in CSV). In this case, the data type for each column (<span class="inline_code">STRING</span>, <span class="inline_code">INTEGER</span>, <span class="inline_code">FLOAT</span>, <span class="inline_code">BOOLEAN</span> or <span class="inline_code">DATE</span>) is explicitly known and no <span class="inline_code">encoder</span> or <span class="inline_code">decoder</span> is needed.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.db import Datasheet, STRING, DATE, date
&gt;&gt;&gt;
&gt;&gt;&gt; ds = Datasheet(fields=(('name', STRING), ('date', DATE)))
&gt;&gt;&gt; ds.append((u'Schrödinger', date('1887-08-12')))
&gt;&gt;&gt; ds.append((u'Hofstadter', date('1945-02-15')))
&gt;&gt;&gt;
&gt;&gt;&gt; ds.save('pets.csv', headers=True)
&gt;&gt;&gt;
&gt;&gt;&gt; ds = Datasheet.load('pets.csv', headers=True)
&gt;&gt;&gt; print ds[0]
[u'Schrödinger', Date('1887-08-12 00:00:00')]
</pre></div>
<p>The <span class="inline_code">csv()</span> function can also be used instead of <span class="inline_code">Datasheet.load()</span>:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.db import csv
&gt;&gt;&gt;
&gt;&gt;&gt; for name, date in csv('pets.csv', separator=',', headers=True):
&gt;&gt;&gt; print name, date</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="date"></a>Date</h2>
<p>The <span class="inline_code">date()</span> function returns a new <span class="inline_code">Date</span>, a convenient subclass of Python's <span class="inline_code">datetime.datetime</span>. It takes an integer (Unix timestamp), a string or <span class="inline_code">NOW</span>. An optional string input format and output format can be given (e.g., <span class="inline_code">"%d/%m/%y"</span>). The default output format is <span class="inline_code">"YYYY-MM-DD hh:mm:ss"</span>.</p>
<pre class="brush:python; gutter:false; light:true;">d = date(int)</pre><pre class="brush:python; gutter:false; light:true;">d = date(NOW, format=DEFAULT)
</pre><pre class="brush:python; gutter:false; light:true;">d = date(string)</pre><pre class="brush:python; gutter:false; light:true;">d = date(string, format=DEFAULT)</pre><pre class="brush:python; gutter:false; light:true;">d = date(string, inputformat, format=DEFAULT)</pre><pre class="brush:python; gutter:false; light:true;">d = date(year, month, day, format=DEFAULT)</pre><pre class="brush:python; gutter:false; light:true;">d = date(year, month, day, hours, minutes, seconds, format=DEFAULT)</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">d.year
d.month # 1-12
d.week # 1-52
d.weekday # 1-7
d.day # 1-31
d.minute # 0-59
d.second # 0-59
d.timestamp # Seconds elapsed since 1/1/1970.</pre><p>If no string input format is given, a number of common formats will be tried:</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Format</span></td>
<td><span class="smallcaps">Example</span></td>
</tr>
<tr>
<td><span class="inline_code">%Y-%m-%d %H:%M:%S</span></td>
<td>2010-09-21 09:27:01</td>
</tr>
<tr>
<td><span class="inline_code">%a, %d %b %Y %H:%M:%S %z</span></td>
<td>Tue, 9 Sep 2010 17:58:28 +0000</td>
</tr>
<tr>
<td><span class="inline_code">%Y-%m-%dT%H:%M:%SZ</span></td>
<td>2010-09-20T09:27:01Z</td>
</tr>
<tr>
<td><span class="inline_code">%Y-%m-%dT%H:%M:%S+0000</span></td>
<td>2010-09-20T09:27:01+0000</td>
</tr>
<tr>
<td><span class="inline_code">%Y-%m-%d %H:%M</span></td>
<td>2010-09-20 09:27</td>
</tr>
<tr>
<td><span class="inline_code">%Y-%m-%d</span></td>
<td>2010-09-20</td>
</tr>
<tr>
<td><span class="inline_code">%d/%m/%Y</span></td>
<td>20/09/2010</td>
</tr>
<tr>
<td><span class="inline_code">%d %B %Y</span></td>
<td>9 september 2010</td>
</tr>
<tr>
<td><span class="inline_code">%B %d %Y</span></td>
<td>September 9 2010</td>
</tr>
<tr>
<td><span class="inline_code">%B %d, %Y</span></td>
<td>September 09, 2010</td>
</tr>
</tbody>
</table>
<p>All date formats used in <a class="link-maintenance" href="pattern-web.html">pattern.web</a> (e.g., Twitter search result) are automatically detected.<br />For an overview of date format syntax, see: <a href="http://docs.python.org/library/time.html#time.strftime" target="_blank">http://docs.python.org/library/time.html#time.strftime</a>.<br />&nbsp;</p>
<p><span class="smallcaps">Date calculations</span></p>
<p>The <span class="inline_code">time()</span> function can be used to add or subtract time to a <span class="inline_code">Date</span>:</p>
<pre class="brush:python; gutter:false; light:true;">time(days=0, seconds=0, minutes=0, hours=0)</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.db import date, time
&gt;&gt;&gt;
&gt;&gt;&gt; d = date('23 august 2011')
&gt;&gt;&gt; d += time(days=2, hours=5)
&gt;&gt;&gt; print type(d)
&gt;&gt;&gt; print d
&gt;&gt;&gt; print d.year, d.month, d.day
&lt;class 'pattern.db.Date'&gt;
2011-08-25 05:00:00
2011 8 25 </pre></div>
<p>&nbsp;</p>
<hr />
<h2>See also</h2>
<ul>
<li><a href="http://www.cherrypy.org/" target="_blank">CherryPy</a> (BSD): o<span>bject-oriented HTTP framework for Python.</span></li>
<li><span><a href="https://www.djangoproject.com/" target="_blank">Django</a> (BSD): m</span><span>odel-view-controller framework for Python.</span></li>
</ul>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,416 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-de</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-de" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-de</a></div>
<h1>pattern.de</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1534" class="node node-type-page"><div class="node-inner">
<div class="content">
<p><span class="big">The pattern.de module contains a fast part-of-speech tagger for German (identifies nouns, adjectives, verbs, etc. in a sentence) and tools for German verb conjugation and noun singularization &amp; pluralization.</span></p>
<p>It can be used by itself or with other&nbsp;<a href="pattern.html">pattern</a>&nbsp;modules:&nbsp;<a href="pattern-web.html">web</a>&nbsp;|&nbsp;<a href="pattern-db.html">db</a>&nbsp;| <a href="pattern-en.html">en</a>&nbsp;|&nbsp;<a href="pattern-search.html">search</a>&nbsp;|&nbsp;<a href="pattern-vector.html">vector</a>&nbsp;|&nbsp;<a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema_de.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<p>The functions in this module take the same parameters and return the same values as their counterparts in <a href="pattern-en.html">pattern.en</a>. Refer to the documentation there for more details.&nbsp;</p>
<h3>Gender</h3>
<p>German nouns and adjectives inflect according to gender. The <span class="inline_code">gender()</span> function predicts the gender (<span class="inline_code">MALE</span>, <span class="inline_code">FEMALE</span>,&nbsp;<span class="inline_code">NEUTRAL</span>) of&nbsp;a given noun with about 75% accuracy:&nbsp;</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.de import gender, MALE, FEMALE, NEUTRAL
&gt;&gt;&gt; print gender('Katze')
FEMALE</pre></div>
<h3>Article</h3>
<p>The <span class="inline_code">article()</span> function returns the article (<span class="inline_code">INDEFINITE</span> or <span class="inline_code">DEFINITE</span>) inflected by gender and role (<span class="inline_code">SUBJECT</span>, <span class="inline_code">OBJECT</span>, <span class="inline_code">INDIRECT</span> or <span class="inline_code">PROPERTY</span>).&nbsp;In the following example,&nbsp;<span class="inline_code">role=OBJECT</span>&nbsp;means that the article is used in front of a noun that is the object of the sentence, as in: <em>Ich sehe <span style="text-decoration: underline;">die Katze</span></em> (<em>I see the cat</em> → what do I see?&nbsp;→ the cat).</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.de import article, DEFINITE, FEMALE, OBJECT
&gt;&gt;&gt; print article('Katze', DEFINITE, gender=FEMALE, role=OBJECT)
die</pre></div>
<h3>Noun singularization &amp; pluralization</h3>
<p>For German nouns there is <span class="inline_code">singularize()</span> and <span class="inline_code">pluralize()</span>.&nbsp;The implementation uses a statistical approach with 84% accuracy for singularization and 72% for pluralization.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.de import singularize, pluralize
&gt;&gt;&gt; print singularize('Katzen')
&gt;&gt;&gt; print pluralize('Katze')
Katze
Katzen </pre></div>
<h3>Verb conjugation</h3>
<p>For German verbs there is <span class="inline_code">conjugate()</span>, <span class="inline_code">lemma()</span>, <span class="inline_code">lexeme()</span> and <span class="inline_code">tenses()</span>.&nbsp;The lexicon for verb conjugation contains about 2,000 common German verbs. For unknown verbs it will fall back to a rule-based approach with an accuracy of about 87%.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.de import conjugate
&gt;&gt;&gt; from pattern.de import INFINITIVE, PRESENT, SG, SUBJUNCTIVE
&gt;&gt;&gt;
&gt;&gt;&gt; print conjugate('war', INFINITIVE)
&gt;&gt;&gt; print conjugate('war', PRESENT, 1, SG, mood=SUBJUNCTIVE)
sein
sei </pre></div>
<p>German verbs have more tenses than English verbs. In particular, the plural differs for each person and there are additional forms for the <span class="inline_code">IMPERATIVE</span> and <span class="inline_code">SUBJUNCTIVE</span> mood.&nbsp;The <span class="inline_code">conjugate()</span> function takes the following optional parameters:</p>
<table class="border">
<tbody>
<tr>
<td class="smallcaps">Tense</td>
<td class="smallcaps">Person</td>
<td class="smallcaps">Number</td>
<td class="smallcaps">Mood</td>
<td class="smallcaps">Aspect</td>
<td class="smallcaps">Alias</td>
<td class="smallcaps">Example</td>
</tr>
<tr>
<td class="inline_code">INFINITIVE</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">"inf"</td>
<td><em>sein</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg"</td>
<td><em>ich <span style="text-decoration: underline;">bin</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg"</td>
<td><em>du <span style="text-decoration: underline;">bist</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg"</td>
<td><em>er <span style="text-decoration: underline;">ist</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl"</td>
<td><em>wir <span style="text-decoration: underline;">sind</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl"</td>
<td><em>ihr <span style="text-decoration: underline;">seid</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl"</td>
<td><em>sie <span style="text-decoration: underline;">sind</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PROGRESSIVE</td>
<td class="inline_code">"part"</td>
<td><em>seiend</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg!"</td>
<td><em>sei</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl!"</td>
<td><em>seien</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl!"</td>
<td><em>seid</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg?"</td>
<td><em>ich <span style="text-decoration: underline;">sei</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg?"</td>
<td><em>du <span style="text-decoration: underline;">seiest</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg?"</td>
<td><em>er <span style="text-decoration: underline;">sei</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl?"</td>
<td><em>wir <span style="text-decoration: underline;">seien</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl?"</td>
<td><em>ihr <span style="text-decoration: underline;">seiet</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl?"</td>
<td><em>sie <span style="text-decoration: underline;">seien</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgp"</td>
<td><em>ich <span style="text-decoration: underline;">war</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgp"</td>
<td><em>du <span style="text-decoration: underline;">warst</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgp"</td>
<td><em>er <span style="text-decoration: underline;">war</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1ppl"</td>
<td><em>wir <span style="text-decoration: underline;">waren</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2ppl"</td>
<td><em>ihr <span style="text-decoration: underline;">wart</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3ppl"</td>
<td><em>sie <span style="text-decoration: underline;">waren</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PROGRESSIVE</td>
<td class="inline_code">"ppart"</td>
<td><em>gewesen</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgp?"</td>
<td><em>ich <span style="text-decoration: underline;">wäre</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgp?"</td>
<td><em>du <span style="text-decoration: underline;">wärest</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgp?"</td>
<td><em>er <span style="text-decoration: underline;">wäre</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1ppl?"</td>
<td><em>wir <span style="text-decoration: underline;">wären</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2ppl?"</td>
<td><em>ihr <span style="text-decoration: underline;">wäret</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3ppl?"</td>
<td><em>sie <span style="text-decoration: underline;">wären</span></em></td>
</tr>
</tbody>
</table>
<p>Instead of optional parameters, a single short alias, or&nbsp;<span class="inline_code">PARTICIPLE</span> or <span class="inline_code">PAST+PARTICIPLE</span> can also be given. With no parameters, the infinitive form of the verb is returned.</p>
<h3>Attributive &amp; predicative adjectives&nbsp;</h3>
<p>German adjectives inflect with an <span class="inline_code">-e</span>,&nbsp;<span class="inline_code">-em</span>, <span class="inline_code">-en</span>, <span class="inline_code">-er</span>, or <span class="inline_code">-es</span> suffix (e.g., <em>neugierig</em>&nbsp;→ <em>die neugierige Katze</em>) depending on gender and role. You can get the base form with the <span class="inline_code">predicative()</span> function, or vice versa with&nbsp;<span class="inline_code">attributive()</span>.&nbsp;For predicative, a statistical approach is used with an accuracy of 98%. For attributive, you need to supply gender (<span class="inline_code">MALE</span>, <span class="inline_code">FEMALE</span>, <span class="inline_code">NEUTRAL</span>) and role (<span class="inline_code">SUBJECT</span>, <span class="inline_code">OBJECT</span>, <span class="inline_code">INDIRECT</span>, <span class="inline_code">PROPERTY</span>).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.de import attributive, predicative
&gt;&gt;&gt; from pattern.de import MALE, FEMALE, SUBJECT, OBJECT, INDIRECT
&gt;&gt;&gt;
&gt;&gt;&gt; print predicative('neugierige')
&gt;&gt;&gt; print attributive('neugierig', gender=FEMALE)
&gt;&gt;&gt; print attributive('neugierig', gender=FEMALE, role=OBJECT)
&gt;&gt;&gt; print attributive('neugierig', gender=FEMALE, role=INDIRECT, article="die")
neugierig
neugierige
neugierige
neugierigen </pre></div>
<h3>Parser</h3>
<p>For parsing there is <span class="inline_code">parse()</span>, <span class="inline_code">parsetree()</span> and <span class="inline_code">split()</span>. The <span class="inline_code">parse()</span> function annotates words in the given string with their part-of-speech <a class="link-maintenance" href="mbsp-tags.html">tags</a> (e.g., <span class="postag">NN</span> for nouns and <span class="postag">VB</span> for verbs). The <span class="inline_code">parsetree()</span> function takes a string and returns a tree of nested objects (<span class="inline_code">Text</span> → <span class="inline_code">Sentence</span> → <span class="inline_code">Chunk</span> → <span class="inline_code">Word</span>). The <span class="inline_code">split()</span> function takes the output of <span class="inline_code">parse()</span> and returns a <span class="inline_code">Text</span>. See the pattern.en documentation (<a class="link-maintenance" href="pattern-en.html#tree">here</a>) for how to manipulate <span class="inline_code">Text</span> objects.&nbsp;</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.de import parse, split
&gt;&gt;&gt;
&gt;&gt;&gt; s = parse('Die Katze liegt auf der Matte.')
&gt;&gt;&gt; for sentence in split(s):
&gt;&gt;&gt; print sentence
Sentence('Die/DT/B-NP/O Katze/NN/I-NP/O liegt/VB/B-VP/O'
'auf/IN/B-PP/B-PNP der/DT/B-NP/I-PNP Matte/NN/I-NP/I-PNP ././O/O')</pre></div>
<p>The parser is built on Gerold Schneider &amp; Martin Volk's&nbsp;<a href="http://www.zora.uzh.ch/28579/" target="_blank">German language model</a>.&nbsp;The accuracy is around 85%. The original <a href="http://www.fi.muni.cz/~xnemcik/nlp/sarrebrugge/handout.pdf" target="_self">STTS</a> tagset is mapped to the <a href="mbsp-tags.html">Penn Treebank</a> tagset. If you need to work with the original tags you can also use&nbsp;<span class="inline_code">parse()</span> with an optional parameter <span class="inline_code">tagset="STTS"</span>.</p>
<p class="small"><span style="text-decoration: underline;">Reference</span>: Schneider, G. &amp; Volk, M. (1998). <br />Adding manual constraints and lexical look-up to a Brill-tagger for German. <em>Proceedings of ESSLLI-98</em>.&nbsp;</p>
<h3>Sentiment analysis</h3>
<p>There's no&nbsp;<span class="inline_code">sentiment()</span> function for German yet.</p>
<p class="small"><span style="text-decoration: underline;">Note</span>: We did a test by automatically assigning scores (<span class="inline_code">-1.0</span>&nbsp;→ +<span class="inline_code">1.0</span>) to adjectives translated from English, but this approach only had 35% accuracy.</p>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,367 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-dev</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-dev" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-dev</a></div>
<h1>pattern.dev</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1480" class="node node-type-page"><div class="node-inner">
<div class="content">
<p><span class="big">Pattern is a web mining module for the Python programming language.</span></p>
<p><span class="big">Pattern is written in Python with extensions in JavaScript. The source code is hosted on GitHub. It is licensed under BSD, so it can be freely incorporated in proprietary applications. Contributions and donations are welcomed.</span></p>
<p>There are six core modules in the <a href="pattern.html">pattern</a> package: <a href="pattern-web.html">web</a> | <a href="pattern-db.html">db</a> | <a href="pattern-text.html">text</a> | <a href="pattern-search.html">search</a> | <a href="pattern-vector.html">vector</a> | <a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Topics</h2>
<ul>
<li><a href="#contribute">Contributing</a></li>
<li><a href="#dependencies">Dependencies</a></li>
<li><a href="#documentation">Documentation</a></li>
<li><a href="#code">Coding conventions</a></li>
<li><a href="#quality">Code quality</a></li>
<li><a href="#language">Language support</a></li>
</ul>
<p>&nbsp;</p>
<hr />
<h2><a name="contribute"></a>Contribute</h2>
<p>The source code is hosted on <a href="https://github.com/clips/pattern" target="_blank">GitHub</a> (see <a class="noexternal link-maintenance" href="http://www.github.com/clips/pattern" target="_blank">http://www.github.com/clips/pattern</a>). GitHub is an online project hosting service with version control. Version control tracks changes to the source code, i.e., it can be rolled back to an earlier state or merged with revisions from different contributors.</p>
<p>To work on Pattern, create a <a href="http://help.github.com/fork-a-repo/" target="_blank">fork</a> of the project, a local copy of the source code that can be edited and updated by you alone. You can manage this copy with the free GitHub application (<a class="noexternal link-maintenance" href="http://windows.github.com/" target="_blank">windows</a> | <a class="noexternal link-maintenance" href="http://mac.github.com/" target="_blank">mac</a>). When you are ready, send us a <a href="http://help.github.com/send-pull-requests/" target="_blank">pull</a> request and we will integrate your changes in the main project.</p>
<p>Let us know if you encounter a bug. We prefer if you create an <a href="https://github.com/clips/pattern/issues" target="_blank">issue</a> on GitHub, so that (until fixed) the problem is visible to all users of Pattern. There is a blue button for donations on the main documentation page. Please support the development if you use Pattern commercially.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="dependencies"></a>Dependencies</h2>
<p>There are six core modules in the package:</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Module</span></td>
<td><span class="smallcaps">Functionality</span></td>
</tr>
<tr>
<td>pattern.web</td>
<td>Asynchronous requests, web services, web crawler, HTML DOM parser.</td>
</tr>
<tr>
<td>pattern.db</td>
<td>Wrappers for databases (MySQL, SQLite) and CSV-files.</td>
</tr>
<tr>
<td>pattern.text</td>
<td>Base classes for parsers, parse trees and sentiment analysis.</td>
</tr>
<tr>
<td>pattern.search</td>
<td>Pattern matching algorithm for parsed text (syntax &amp; semantics).</td>
</tr>
<tr>
<td>pattern.vector</td>
<td>Vector space model, clustering, classification.</td>
</tr>
<tr>
<td>pattern.graph</td>
<td>Graph analysis &amp; visualization.</td>
</tr>
</tbody>
</table>
<p>There are two helper modules: pattern.metrics (statistics) and canvas.js (visualization).</p>
<h3>Design philosophy</h3>
<p>Pattern is written in Python, with JavaScript extensions for data visualization (graph.js and canvas.js). The package works out of the box. If C/C++ code is bundled for performance (e.g., LIBSVM), it includes precompiled binaries for all major platforms (Windows, Linux, Mac).</p>
<p>Pattern modules are standalone. If a module imports another module, it fails silently if that module is not present. For example, pattern.text implements a parser that uses a Perceptron language model when pattern.vector is present, but falls back to a lexicon of known words and rules for unknown words if used by itself. A single module can have a lot of interdependent classes, hence the large __init__.py files.</p>
<p>Pattern modules can bundle other BSD-licensed Python projects (e.g., BeautifulSoup). For larger projects or GPL-licensed projects, it provides code to map data structures.</p>
<h3>Base classes</h3>
<p>In pattern.web, each web service (e.g., Google, Twitter) inherits from <span class="inline_code">SearchEngine</span> and returns <span class="inline_code">Result</span> objects. Each MediaWiki web service (e.g., Wikipedia, Wiktionary) inherits from <span class="inline_code">MediaWiki</span>.</p>
<p>In pattern.db, each database engine is wrapped by <span class="inline_code">Database</span>. It supports MySQL and SQLite, with future plans for MongoDB. See <span class="inline_code">Database</span><span class="inline_code">.connect()</span>, <span class="inline_code">escape()</span>, <span class="inline_code">_field_SQL()</span> and <span class="inline_code">_update()</span>.</p>
<p>In pattern.text, each language inherits from <span class="inline_code">Parser</span>, having a lexicon of known words and an optional language model. Case studies for <a class="link-maintenance" href="http://www.clips.ua.ac.be/pages/using-wikicorpus-nltk-to-build-a-spanish-part-of-speech-tagger">Spanish</a> and <a class="link-maintenance" href="http://www.clips.ua.ac.be/pages/using-wiktionary-to-build-an-italian-part-of-speech-tagger">Italian</a> show how to train a <span class="inline_code">Lexicon</span>. A bundled pattern.vector example shows how to train a Perceptron <span class="inline_code">Model</span>.</p>
<p>In pattern.vector, each classifier inherits from <span class="inline_code">Classifier</span> (e.g., KNN, SVM). Each clustering algorithm is available from <span class="inline_code">Model.cluster()</span>.</p>
<p>In pattern.graph, subclasses of <span class="inline_code">Node</span> or <span class="inline_code">Edge</span> can be used with (subclasses of) <span class="inline_code">Graph</span> by setting the <span class="inline_code">base</span> parameter of <span class="inline_code">Graph.add_node()</span> and <span class="inline_code">add_edge()</span>. Each layout algorithm (e.g., force-based springs) inherits from <span class="inline_code">GraphLayout</span>.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="documentation"></a>Documentation</h2>
<p>Each function or method has a docstring:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">def find(match=lambda item: False, list=[]):
""" Returns the first item in the given list for which match(item) is True.
"""
for item in list:
if match(item) is True:
return item</pre></div>
<p>The docstring provides a concise description of the type of input and output. In Pattern, a docstring starts with "Returns" (for a function) or "Yields" (for a property). Each function has a unit test, to verify that it is fit for use. Each function has an engaging example, bundled in the package or in the documentation.</p>
<p>Pattern does not have a documentation framework. The documentation is written by hand and in constant revision. Please report spelling errors and examples with bugs.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="code"></a>Coding conventions</h2>
<h3>Whitespace</h3>
<p>The source code is not strict <a href="http://www.python.org/dev/peps/pep-0008/" target="_blank">PEP8</a>. For example, additional whitespace is used so that property assignments or inline comments are vertically aligned as a block:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">class Table(object):
def __init__(self, name, database):
""" A collection of rows with one or more fields of a certain type.
"""
self.database = database
self.name = name
self.fields = [] # List of field names (i.e., column names).
self.schema = {} # Dictionary of (field, Schema)-items.
self.default = {} # Default values for Table.insert().
self.primary_key = None
self._update()</pre></div>
<p>Whitespace is sometimes used to align dictionary keys and values:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">url = URL('http://search.twitter.com/search.json?', method=GET, query={
'q': query,
'page': start,
'rpp': min(count, 100)
})</pre></div>
<h3>Class and function names</h3>
<p>Single words are preferred for class names. Compound terms use CamelCase, e.g., <span class="inline_code">SearchEngine</span> or <span class="inline_code">AsynchronousRequest</span>. Single, descriptive words are preferred for functions and methods. Compound terms use lowercase_with_underscore. If a method takes no arguments, it is a property:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">class AsynchronousRequest:
@property
def done(self):
return not self._thread.isAlive() # We'd prefer "_thread.alive".</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">while not request.done:
... </pre></div>
<h3>Variable names</h3>
<p>The source code uses single character names abundantly. For example, dictionary <span style="text-decoration: underline;">k</span>eys and <span style="text-decoration: underline;">v</span>alues are <span class="inline_code">k</span> and <span class="inline_code">v</span>, a string is <span class="inline_code">s</span>. This is done to make the structure of the algorithm stand out (i.e., the actual function and method calls):</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">def normalize(s, punctuation='!?.:;,()[] '):
s = s.decode('utf-8')
s = s.lower()
s = s.strip(punctuation)
return s</pre></div>
<p>Frequently used single character variable names:</p>
<table class="border">
<tbody>
<tr>
<td style="text-align: center;"><span class="smallcaps">Variable</span></td>
<td><span class="smallcaps">Meaning</span></td>
<td><span class="smallcaps">Example</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">a</span></td>
<td>array, all</td>
<td><span class="inline_code">a = [normalize(w) for w in words]</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">b</span></td>
<td>boolean</td>
<td><span class="inline_code">while b is False:</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">d</span></td>
<td>distance, document</td>
<td><span class="inline_code">d = distance(v1, v2)</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">e</span></td>
<td>element</td>
<td><span class="inline_code">e = html.find('#nav')</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">f</span></td>
<td>file, filter, function</td>
<td><span class="inline_code">f = open('data.csv', 'r')</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">i</span></td>
<td>index</td>
<td><span class="inline_code">for i in range(len(matrix)):</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">j</span></td>
<td>index</td>
<td><span class="inline_code">for j in range(len(matrix[i])):</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">k</span></td>
<td>key</td>
<td><span class="inline_code">for k in vector.keys():</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">n</span></td>
<td>list length</td>
<td><span class="inline_code">n = len(a)</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">p</span></td>
<td>parser, pattern</td>
<td><span class="inline_code">p = pattern.search.compile('NN')</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">q</span></td>
<td>query</td>
<td><span class="inline_code">for r in twitter.search(q):</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">r</span></td>
<td>result, row</td>
<td><span class="inline_code">for r in csv('data.csv'):</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">s</span></td>
<td>string</td>
<td><span class="inline_code">s = s.decode('utf-8').strip()</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">t</span></td>
<td>time</td>
<td><span class="inline_code">t = time.time() - t0</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">v</span></td>
<td>value, vector</td>
<td><span class="inline_code">for k, v in vector.items():</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">w</span></td>
<td>word</td>
<td><span class="inline_code">for i, w in enumerate(sentence.words):</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">x</span></td>
<td>horizontal position</td>
<td><span class="inline_code">node.x = 0</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">y</span></td>
<td>vertical position</td>
<td><span class="inline_code">node.y = 0</span></td>
</tr>
</tbody>
</table>
<h3>Dictionaries</h3>
<p>The source code uses dictionaries abundantly. Dictionaries are fast for lookup. For example, pattern.vector represents vectors as sparse feature&nbsp;→ weight dictionaries:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">v1 = document1.vector
v2 = document2.vector
cos = sum(v1.get(w,0) * f for w, f in v2.items()) / (norm(v1) * norm(v2) or 1)</pre></div>
<p>Pattern algorithms are <a class="link-maintenance" href="pattern-metrics.html#profile">profiled</a> and optimized with caching mechanisms.</p>
<h3>List comprehensions</h3>
<p>The source code uses list comprehension abundantly. It is concise, and often faster than <span class="inline_code">map()</span>. However, it can also be harder to read (a comment should be added).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">def words(s, punctuation='!?.:;,()[] '):
return [w.strip(punctuation) for w in s.split()]
</pre></div>
<h3>Ternary operator</h3>
<p>Previous versions of Pattern supported Python 2.4, which does not have the ternary operator (single-line if). A part of the source code still uses a boolean condition to emulate it:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">s = s.lower() if lowercase is True else s # Python 2.5+</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">s = lowercase is True and s.lower() or s # Python 2.4</pre></div>
<p>With boolean conditions, care must be taken for values <span class="inline_code">0</span>, <span class="inline_code">''</span>, <span class="inline_code">[]</span>, <span class="inline_code">()</span>, <span class="inline_code">{}</span>, and <span class="inline_code">None</span>, since they evaluate as&nbsp;<span class="inline_code">False</span> and trigger the or-clause.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="quality"></a>Code quality</h2>
<p>The source code has about 25,000 lines of Python code (25% unit tests), 5,000 lines of JavaScript, and 20,000 lines of bundled dependencies (BeautifulSoup, PDFMiner, PyWordNet, LIBSVM, LIBLINEAR, etc.). To evaluate the code quality,&nbsp;<a href="http://www.logilab.org/857" target="_blank">pylint</a>&nbsp;can be used:</p>
<div class="install">
<pre class="gutter:false; light:true;">&gt; cd pattern-2.x
&gt; pylint pattern --rcfile=.pylintrc</pre></div>
<p>Important pylint id's are those starting with <span class="inline_code">E</span> (= possible bugs).</p>
<p>The&nbsp;<span class="inline_code">.pylintrc</span>&nbsp;configuration file defines a number of custom settings:</p>
<ul>
<li>Instead of 80 characters per line, 100 characters are allowed.</li>
<li>Ignore pylint id <span class="inline_code">C0103</span>, single-character variable names are allowed.</li>
<li>Ignore pylint id <span class="inline_code">W0142</span>,&nbsp;<span class="inline_code">*args</span> and <span class="inline_code">**kwargs</span> are allowed.</li>
<li>Ignore bundled dependencies.</li>
</ul>
<p>The source code scores about 7.38 / 10. A known issue is the absence of docstrings in unit tests.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="language"></a>Language support</h2>
<p>Pattern currently has natural language processing tools (e.g., pattern.en, pattern.es) for most languages on the to-do list.&nbsp;There is no sentiment analysis yet for Spanish and German. Chinese is an open task.</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Language</span></td>
<td style="text-align: center;"><span class="smallcaps">Code</span></td>
<td style="text-align: center;"><span class="smallcaps">Speakers</span></td>
<td><span class="smallcaps">Example countries</span></td>
</tr>
<tr>
<td>Mandarin</td>
<td style="text-align: center;"><span class="inline_code">cmn</span></td>
<td style="text-align: center;">955M</td>
<td>China + Taiwan (945), Singapore (3)</td>
</tr>
<tr>
<td><s>Spanish</s></td>
<td style="text-align: center;"><span class="inline_code">es</span></td>
<td style="text-align: center;">350M</td>
<td>Argentina (40), Colombia (40), Mexico (100), Spain (45)</td>
</tr>
<tr>
<td><s>English</s></td>
<td style="text-align: center;"><span class="inline_code">en</span></td>
<td style="text-align: center;">340M</td>
<td>Canada (30), United Kingdom (60), United States (300)</td>
</tr>
<tr>
<td><s>German</s></td>
<td style="text-align: center;"><span class="inline_code">de</span></td>
<td style="text-align: center;">100M</td>
<td>Austria (10), Germany (80), Switzerland (7)</td>
</tr>
<tr>
<td><s>French</s></td>
<td style="text-align: center;"><span class="inline_code">fr</span></td>
<td style="text-align: center;">70M</td>
<td>France (65), Côte d'Ivoire (20)</td>
</tr>
<tr>
<td><s>Italian</s></td>
<td style="text-align: center;"><span class="inline_code">it</span></td>
<td style="text-align: center;">60M</td>
<td>Italy (60)</td>
</tr>
<tr>
<td><s>Dutch</s></td>
<td style="text-align: center;"><span class="inline_code">nl</span></td>
<td style="text-align: center;">25M</td>
<td>The Netherlands (25), Belgium (5), Suriname (1)</td>
</tr>
</tbody>
</table>
<p>There are two case studies that demonstrate how to build a pattern.xx language module:</p>
<ul>
<li><a href="http://www.clips.ua.ac.be/pages/using-wiktionary-to-build-an-italian-part-of-speech-tagger">Using Wiktionary to build an Italian part-of-speech tagger</a></li>
<li><a href="http://www.clips.ua.ac.be/pages/using-wikicorpus-nltk-to-build-a-spanish-part-of-speech-tagger">Using Wikicorpus &amp; NLTK to build a Spanish part-of-speech tagger</a></li>
</ul>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,733 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-en</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-en" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-en</a></div>
<h1>pattern.en</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1383" class="node node-type-page"><div class="node-inner">
<div class="content">
<p class="big">The pattern.en module contains a fast part-of-speech tagger for English (identifies nouns, adjectives, verbs, etc. in a sentence), sentiment analysis, tools for English verb conjugation and noun singularization &amp; pluralization, and a WordNet interface.</p>
<p>It can be used by itself or with other <a href="pattern.html">pattern</a> modules: <a href="pattern-web.html">web</a> | <a href="pattern-db.html">db</a>&nbsp;| en | <a href="pattern-search.html">search</a> | <a href="pattern-vector.html">vector</a> | <a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<ul>
<li><a href="#article">Indefinite article</a></li>
<li><a href="#pluralization">Pluralization + singularization</a></li>
<li><a href="#comparative">Comparative + superlative</a></li>
<li><a href="#conjugation">Verb conjugation</a></li>
<li><a href="#quantify">Quantification</a></li>
<li><a href="#spelling">Spelling</a></li>
<li><a href="#ngram">n-grams</a></li>
<li><a href="#parser">Parser</a>&nbsp;<span class="smallcaps link-maintenance">(tokenizer, tagger, chunker)</span></li>
<li><a href="#tree">Parse trees</a></li>
<li><a href="#sentiment">Sentiment</a></li>
<li><a href="#modality">Mood &amp; modality</a></li>
<li><a href="#wordnet">WordNet</a></li>
<li><a href="#wordlist">Wordlists</a></li>
</ul>
<p>&nbsp;</p>
<hr />
<h2><a name="article"></a>Indefinite article</h2>
<p>The article is the most common determiner (<span class="postag">DT</span>) in English. It defines whether the successive noun is definite (<em><span style="text-decoration: underline;">the</span> cat</em>) or indefinite (<em><span style="text-decoration: underline;">a</span> cat</em>). The definite article is always <em>the</em>. The indefinite article can be&nbsp;<em>a</em> or <em>an</em>&nbsp;depending on how the successive noun is pronounced.</p>
<pre class="brush:python; gutter:false; light:true;">article(word, function=INDEFINITE) # DEFINITE | INDEFINITE</pre><pre class="brush:python; gutter:false; light:true;">referenced(word, article=INDEFINITE) # Returns article + word.
</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import referenced
&gt;&gt;&gt;
&gt;&gt;&gt; print referenced('university')
&gt;&gt;&gt; print referenced('hour')
a university
an hour</pre></div>
<p><span class="small"><span style="text-decoration: underline;">Reference</span>: Granger, M. (2006). <em>Ruby Linguistics Framework</em>, </span><span class="small">http://deveiate.org/projects/Linguistics</span></p>
<p>&nbsp;</p>
<hr />
<h2><a name="pluralization"></a>Pluralization + singularization</h2>
<p>The <span class="inline_code">pluralize()</span> function returns the plural form of a singular noun. The <span class="inline_code">singularize()</span> function returns the singular form of a plural noun. The <span class="inline_code">pos</span> parameter (part-of-speech) can be set to <span class="inline_code">NOUN</span> or <span class="inline_code">ADJECTIVE</span>, but only a small number of possessive adjectives inflect (e.g. <em>my</em> → <em>our</em>). The <span class="inline_code">custom</span> dictionary is for user-defined replacements. Accuracy of the algorithms is 96%.</p>
<pre class="brush:python; gutter:false; light:true;">pluralize(word, pos=NOUN, custom={}, classical=True)</pre><pre class="brush:python; gutter:false; light:true;">singularize(word, pos=NOUN, custom={})</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import pluralize, singularize
&gt;&gt;&gt;
&gt;&gt;&gt; print pluralize('child')
&gt;&gt;&gt; print singularize('wolves')
children
wolf
</pre></div>
<p><span class="small"><span style="text-decoration: underline;">Reference</span>: <br />Conway, D. (1998). An Algorithmic Approach to English Pluralization. <em>Proceedings of the 2nd Perl conference</em>.<br />Ferrer, B. (2005). <em>Inflector for Python</em>, http://www.bermi.org/projects/inflector</span></p>
<p>&nbsp;</p>
<hr />
<h2><a name="comparative"></a>Comparative + superlative</h2>
<p>The <span class="inline_code">comparative()</span> and <span class="inline_code">superlative()</span> functions give the comparative or superlative form of an adjective. Words with three or more syllables (e.g., <em>fantastic</em>) are simply preceded by <em>more</em> or <em>most</em>.</p>
<pre class="brush:python; gutter:false; light:true;">comparative(adjective) # big =&gt; bigger</pre><pre class="brush:python; gutter:false; light:true;">superlative(adjective) # big =&gt; biggest</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import comparative, superlative
&gt;&gt;&gt;
&gt;&gt;&gt; print comparative('bad')
&gt;&gt;&gt; print superlative('bad')
worse
worst
</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="conjugation"></a>Verb conjugation</h2>
<p>The pattern.en module has a lexicon of 8,500 common English verbs and their conjugated forms (infinitive, 3rd singular present, present participle, past and past participle – verbs such as <em>be</em>&nbsp;may have more forms). Some verbs can also be negated, including&nbsp;<em>be</em>, <em>can</em>, <em>do</em>, <em>will</em>, <em>must</em>, <em>have</em>, <em>may</em>, <em>need</em>, <em>dare</em>, <em>ought</em>.</p>
<pre class="brush:python; gutter:false; light:true;">conjugate(verb,
tense = PRESENT, # INFINITIVE, PRESENT, PAST, FUTURE
person = 3, # 1, 2, 3 or None
number = SINGULAR, # SG, PL
mood = INDICATIVE, # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE
aspect = IMPERFECTIVE, # IMPERFECTIVE, PERFECTIVE, PROGRESSIVE
negated = False, # True or False
parse = True) </pre><pre class="brush:python; gutter:false; light:true;">lemma(verb) # Base form, e.g., are =&gt; be.</pre><pre class="brush:python; gutter:false; light:true;">lexeme(verb) # List of possible forms: be =&gt; is, was, ...</pre><pre class="brush:python; gutter:false; light:true;">tenses(verb) # List of possible tenses of the given form.
</pre><p>The&nbsp;<span class="inline_code">conjugate()</span> function takes the following optional parameters:</p>
<table class="border">
<tbody>
<tr>
<td style="text-align: left;"><span class="smallcaps">Tense</span></td>
<td style="text-align: left;"><span class="smallcaps">Person</span></td>
<td style="text-align: left;"><span class="smallcaps">Number</span></td>
<td style="text-align: left;"><span class="smallcaps">Mood</span></td>
<td style="text-align: left;"><span class="smallcaps">Aspect</span></td>
<td style="text-align: left;"><span class="smallcaps">Alias</span></td>
<td style="text-align: center;"><span class="smallcaps">Tag</span></td>
<td style="text-align: left;"><span class="smallcaps">Example</span></td>
</tr>
<tr>
<td><span class="inline_code">INFINITIVE</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">"inf"</span></td>
<td style="text-align: center;"><span class="postag">VB</span></td>
<td><em>be</em></td>
</tr>
<tr>
<td><span class="inline_code">PRESENT</span></td>
<td><span class="inline_code">1</span></td>
<td><span class="inline_code">SG</span></td>
<td><span class="inline_code">INDICATIVE</span></td>
<td><span class="inline_code">IMPERFECTIVE</span></td>
<td><span class="inline_code">"1sg"</span></td>
<td style="text-align: center;"><span class="postag">VBP</span></td>
<td><em>I <span style="text-decoration: underline;">am</span></em></td>
</tr>
<tr>
<td><span class="inline_code">PRESENT</span></td>
<td><span class="inline_code">2</span></td>
<td><span class="inline_code">SG</span></td>
<td><span class="inline_code">INDICATIVE</span></td>
<td><span class="inline_code">IMPERFECTIVE</span></td>
<td><span class="inline_code">"2sg"</span></td>
<td style="text-align: center;">&nbsp;·</td>
<td><em>you <span style="text-decoration: underline;">are</span></em></td>
</tr>
<tr>
<td><span class="inline_code">PRESENT</span></td>
<td><span class="inline_code">3</span></td>
<td><span class="inline_code">SG</span></td>
<td><span class="inline_code">INDICATIVE</span></td>
<td><span class="inline_code">IMPERFECTIVE</span></td>
<td><span class="inline_code">"3sg"</span></td>
<td style="text-align: center;"><span class="postag">VBZ</span></td>
<td><em>he <span style="text-decoration: underline;">is</span></em></td>
</tr>
<tr>
<td><span class="inline_code">PRESENT</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">PL</span></td>
<td><span class="inline_code">INDICATIVE</span></td>
<td><span class="inline_code">IMPERFECTIVE</span></td>
<td><span class="inline_code">"pl"</span></td>
<td style="text-align: center;">&nbsp;·</td>
<td><em>are</em></td>
</tr>
<tr>
<td><span class="inline_code">PRESENT</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">INDICATIVE</span></td>
<td><span class="inline_code">PROGRESSIVE</span></td>
<td><span class="inline_code">"part"</span></td>
<td style="text-align: center;"><span class="postag">VBG</span></td>
<td><em>being</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">PAST</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">None</span></td>
<td><span class="inline_code">"p"</span></td>
<td style="text-align: center;"><span class="postag">VBD</span></td>
<td><em>were</em></td>
</tr>
<tr>
<td><span class="inline_code">PAST</span></td>
<td><span class="inline_code"><span>1</span></span></td>
<td><span class="inline_code"><span>SG</span></span></td>
<td><span class="inline_code">INDICATIVE</span></td>
<td><span class="inline_code">IMPERFECTIVE</span></td>
<td><span class="inline_code">"1sgp"</span></td>
<td style="text-align: center;">&nbsp;·</td>
<td><em>I <span style="text-decoration: underline;">was</span></em></td>
</tr>
<tr>
<td><span class="inline_code">PAST</span></td>
<td><span class="inline_code"><span>2</span></span></td>
<td><span class="inline_code"><span>SG</span></span></td>
<td><span class="inline_code"><span>INDICATIVE</span></span></td>
<td><span class="inline_code">IMPERFECTIVE</span></td>
<td><span class="inline_code">"2sgp"</span></td>
<td style="text-align: center;">&nbsp;·</td>
<td><em>you <span style="text-decoration: underline;">were</span></em></td>
</tr>
<tr>
<td><span class="inline_code">PAST</span></td>
<td><span class="inline_code"><span>3</span></span></td>
<td><span class="inline_code"><span>SG</span></span></td>
<td><span class="inline_code"><span>INDICATIVE</span></span></td>
<td><span class="inline_code">IMPERFECTIVE</span></td>
<td><span class="inline_code">"3sgp"</span></td>
<td style="text-align: center;">&nbsp;·</td>
<td><em>he <span style="text-decoration: underline;">was</span></em></td>
</tr>
<tr>
<td><span class="inline_code">PAST</span></td>
<td><span class="inline_code"><span>None</span></span></td>
<td><span class="inline_code"><span>PL</span></span></td>
<td><span class="inline_code"><span>INDICATIVE</span></span></td>
<td><span class="inline_code">IMPERFECTIVE</span></td>
<td><span class="inline_code">"ppl"</span></td>
<td style="text-align: center;">&nbsp;·</td>
<td><em>were</em></td>
</tr>
<tr>
<td style="text-align: left;"><span class="inline_code">PAST</span></td>
<td style="text-align: left;"><span class="inline_code"><span>None</span></span></td>
<td style="text-align: left;"><span class="inline_code">None</span></td>
<td style="text-align: left;"><span class="inline_code">INDICATIVE</span></td>
<td style="text-align: left;"><span class="inline_code"><span>PROGRESSIVE</span></span></td>
<td style="text-align: left;"><span class="inline_code">"ppart"</span></td>
<td style="text-align: center;"><span class="postag">VBN</span></td>
<td style="text-align: left;"><em>been</em></td>
</tr>
</tbody>
</table>
<p>Instead of optional parameters, a single short alias, the part-of-speech tag, or&nbsp;<span class="inline_code">PARTICIPLE</span>&nbsp;or <span class="inline_code">PAST+PARTICIPLE</span> can also be given. With no parameters, the infinitive form of the verb is returned.</p>
<p>For example:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import conjugate, lemma, lexeme
&gt;&gt;&gt;
&gt;&gt;&gt; print lexeme('purr')
&gt;&gt;&gt; print lemma('purring')
&gt;&gt;&gt; print conjugate('purred', '3sg') # he / she / it
['purr', 'purrs', 'purring', 'purred']
purr
purrs
</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import tenses, PAST, PL
&gt;&gt;&gt;
&gt;&gt;&gt; print 'p' in tenses('purred') # By alias.
&gt;&gt;&gt; print PAST in tenses('purred')
&gt;&gt;&gt; print (PAST, 1, PL) in tenses('purred')
True
True
True </pre></div>
<p><span class="small"><span style="text-decoration: underline;">Reference</span>: <em>XTAG English morphology</em> (1999), University of Pennsylvania, http://www.cis.upenn.edu/~xtag</span></p>
<p>&nbsp;<br /><span class="smallcaps">Rule-based conjugation</span></p>
<p>All verb functions have an optional <span class="inline_code">parse</span>&nbsp;parameter (<span class="inline_code">True</span> by default) that enables a rule-based parser for unknown verbs. This will not work for irregular verbs, and it is fragile for verbs ending in -e in the past tense, or the present participle. The overall accuracy of the algorithm is 91%.</p>
<p>With <span class="inline_code">parse=False</span>,&nbsp;<span class="inline_code">conjugate()</span>&nbsp;and&nbsp;<span class="inline_code">lemma()</span>&nbsp;yield&nbsp;<span class="inline_code">None</span>:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import verbs, conjugate, PARTICIPLE
&gt;&gt;&gt;
&gt;&gt;&gt; print 'google' in verbs.infinitives
&gt;&gt;&gt; print 'googled' in verbs.inflections
&gt;&gt;&gt;
&gt;&gt;&gt; print conjugate('googled', tense=PARTICIPLE, parse=False)
&gt;&gt;&gt; print conjugate('googled', tense=PARTICIPLE, parse=True)
False
False
None
googling
</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="quantify"></a>Quantification</h2>
<p>The <span class="inline_code">number()</span> function returns a <span class="inline_code">float</span> or <span class="inline_code">int</span> parsed from the given (numeric) string. If no number can be parsed from the string, it returns <span class="inline_code">0</span>.</p>
<p>The <span class="inline_code">numerals()</span> function returns the given <span class="inline_code">int</span> or <span class="inline_code">float</span> as a string of numerals. By default, the fraction is rounded to two decimals.</p>
<p>The <span class="inline_code">quantify()</span> function returns a word count approximation. Two similar words are a <em>pair</em>, three to eight <em>several</em>, and so on. Words can be given as a list, a word → count dictionary, or as a single word + amount.</p>
<p>The <span class="inline_code">reflect()</span> function quantifies Python objects; see the examples bundled with the module.</p>
<pre class="brush:python; gutter:false; light:true;">number(string) # "seventy-five point two" =&gt; 75.2</pre><pre class="brush:python; gutter:false; light:true;">numerals(n, round=2) # 2.245 =&gt; "two point twenty-five"</pre><pre class="brush:python; gutter:false; light:true;">quantify([word1, word2, ...], plural={})</pre><pre class="brush:python; gutter:false; light:true;">reflect(object, quantify=True, replace=[])
</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import quantify
&gt;&gt;&gt;
&gt;&gt;&gt; print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
&gt;&gt;&gt; print quantify({'carrot': 100, 'parrot': 20})
&gt;&gt;&gt; print quantify('carrot', amount=1000)
several chickens, a pair of geese and a duck
dozens of carrots and a score of parrots
hundreds of carrots
</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="spelling"></a>Spelling</h2>
<p>The <span class="inline_code">suggest()</span> function returns a list of spelling suggestions for a given word. Each suggestion is a <span class="inline_code">(word,</span> <span class="inline_code">confidence)</span>-tuple. It is about 70% accurate.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">suggest(string)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.en import suggest
&gt;&gt;&gt; print suggest("parot")
[("part", 0.99), ("parrot", 0.01)]</pre></div>
<p><span class="small"><span style="text-decoration: underline;">Reference</span>: Norvig, P. (2007). <em>How to Write a Spelling Corrector</em>. http://norvig.com/spell-correct.html</span>&nbsp;</p>
<p>&nbsp;</p>
<hr />
<h2><em><a name="ngram"></a>n</em>-grams</h2>
<p>The <span class="inline_code">ngrams()</span> function returns&nbsp;a list of <em>n</em>-grams (i.e., tuples of <em>n</em> successive words) from the given string.&nbsp;Alternatively, you can supply a <span class="inline_code">Text</span> or <span class="inline_code">Sentence</span> object (see further). Punctuation marks are stripped from words, and&nbsp;<em>n</em>-grams will not run over sentence delimiters (i.e., .!?), unless <span class="inline_code">continuous</span> is <span class="inline_code">True</span>.</p>
<pre class="brush:python; gutter:false; light:true;">ngrams(string, n=3, punctuation=".,;:!?()[]{}`''\"@#$^&amp;*+-|=~_", continuous=False)</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import ngrams
&gt;&gt;&gt; print ngrams("I am eating pizza.", n=2) # bigrams
[('I', 'am'), ('am', 'eating'), ('eating', 'pizza')] </pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="parser"></a>Parser</h2>
<p>A parser identifies sentences, words and word types in a string of text. This involves tokenization (distinguishing between abbreviations and sentence breaks), part-of-speech tagging (annotating words with their type, e.g., is <em>can</em> a <span class="postag">noun</span> or a <span class="postag">verb</span>?) and chunking (grouping consecutive words that belong together). Parsing can be used to answer questions such as <em>who did what and why</em> and is useful in a wide range of text mining applications.&nbsp;The pattern.en parser uses a lexicon of 100,000 known words and their part-of-speech <a class="link-maintenance" href="MBSP-tags.html" target="_blank">tag</a>, along with rules for unknown words based on word suffix (e.g., <em>-ly</em> = <span class="postag">ADVERB</span>) and context (surrounding words). This approach is fast but not always accurate, since many words are ambiguous and hard to capture with simple rules. The overall accuracy is about 95% (95.8% on WSJ portions 22-24). It is lower for informal language use (e.g., chat language).</p>
<p>The <span class="inline_code">parse()</span> function takes a string of text and returns a part-of-speech tagged Unicode string. Sentences in the output are separated by newline characters.</p>
<pre class="brush:python; gutter:false; light:true;">parse(string,
tokenize = True, # Split punctuation marks from words?
tags = True, # Parse part-of-speech tags? (NN, JJ, ...)
chunks = True, # Parse chunks? (NP, VP, PNP, ...)
relations = False, # Parse chunk relations? (-SBJ, -OBJ, ...)
lemmata = False, # Parse lemmata? (ate =&gt; eat)
encoding = 'utf-8', # Input string encoding.
tagset = None) # Penn Treebank II (default) or UNIVERSAL.
</pre><p>For example:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import parse
&gt;&gt;&gt; print parse('I eat pizza with a fork.')
I/PRP/B-NP/O eat/VBP/B-VP/O pizza/NN/B-NP/O with/IN/B-PP/B-PNP a/DT/B-NP/I-PNP
fork/NN/I-NP/I-PNP ././O/O
</pre></div>
<ul>
<li>With&nbsp;<span class="inline_code">tags</span><span class="inline_code">=True</span> each word is annotated with a part-of-speech tag.&nbsp;</li>
<li>With <span class="inline_code">chunks=True</span>&nbsp;each word is annotated with a chunk tag and a&nbsp;<span class="postag">PNP</span> tag (prepositional noun phrase, <span class="postag">PP</span> + <span class="postag">NP</span>). The <span class="inline_code postag">O</span> tag (= outside) means that the word is not part of a chunk.</li>
<li>With <span class="inline_code">relations=True</span>&nbsp;each word is annotated with a role tag (e.g., <span class="postag">-SBJ</span>&nbsp;for subject or <span class="postag">-OBJ</span>&nbsp;for object).</li>
<li>With <span class="inline_code">lemmata=True</span> each word is annotated with its base form.&nbsp;</li>
<li>With <span class="inline_code">tokenize=False</span>, punctuation marks will not be separated from words. <br />The input string is expected to be tokenized beforehand, or sentence delimiters are not discovered.</li>
</ul>
<p><span class="small"><span style="text-decoration: underline;">Reference</span>: Brill, E. (1992). <em>A simple rule-based part of speech tagger.</em> ANLC '92 Proceedings.</span></p>
<h3>Parser tags</h3>
<p>Let's examine the word <em>fork</em> and the tags assigned by the parser in the example above:</p>
<table class="border">
<tbody>
<tr>
<td class="smallcaps" style="text-align: center;" align="center">word</td>
<td class="smallcaps" style="text-align: center;" align="center">part-of-speech</td>
<td class="smallcaps" style="text-align: center;" align="center">chunk</td>
<td class="smallcaps" style="text-align: center;" align="center">pnp</td>
</tr>
<tr>
<td align="center">fork</td>
<td align="center"><span class="postag">NN </span></td>
<td align="center"><span class="postag">I-NP</span></td>
<td align="center"><span class="postag">I-PNP</span></td>
</tr>
</tbody>
</table>
<p>The word's part-of-speech tag is <span class="postag">NN</span>, which means that it is a noun. The word occurs in a <span class="postag">NP</span> chunk, a noun phrase (i.e., <em>a fork</em>). It is also part of a prepositional noun phrase (i.e., <em><span style="text-decoration: underline;">with</span> a fork</em>).</p>
<p>Common part-of-speech tags are&nbsp;<span class="postag">NN</span> (noun), <span class="postag">VB</span> (verb),&nbsp;<span class="postag">JJ</span> (adjective), <span class="postag">RB</span> (adverb)&nbsp;and&nbsp;<span class="postag">IN</span> (preposition).<br />Common chunk tags are&nbsp;<span class="postag">NP</span> (noun phrase) and <span class="postag">VP</span> (verb phrase).<br />Common chunk relations are <span class="postag">NP-SBJ</span> (subject) and <span class="postag">NP-OBJ</span> (object).</p>
<p>The <a class="link-maintenance" href="MBSP-tags.html" target="_blank">Penn Treebank II tagset</a>&nbsp;gives an overview of all the possible tags generated by the parser.</p>
<h3>Parser tagger &amp; tokenizer</h3>
<p>The <span class="inline_code">tokenize()</span> function returns a list of sentences, with punctuation marks split from words. It takes an optional&nbsp;<span class="inline_code">replace</span>&nbsp;dictionary, by default used to split contractions, i.e.,&nbsp;<span class="inline_code">{"'ve":</span>&nbsp;<span class="inline_code">"&nbsp;</span><span class="inline_code">'ve"</span><span class="inline_code">,</span> <span class="inline_code">...}</span>.</p>
<p>The <span class="inline_code">tag()</span> function simply annotates words with their part-of-speech tag and returns a list of <span class="inline_code">(word,</span> <span class="inline_code">tag)</span>-tuples:</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">tokenize(string, punctuation=".,;:!?()[]{}`''\"@#$^&amp;*+-|=~_", replace={})</pre><pre class="brush:python; gutter:false; light:true;">tag(string, tokenize=True, encoding='utf-8')</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import tag
&gt;&gt;&gt;
&gt;&gt;&gt; for word, pos in tag('I feel *happy*!'):
&gt;&gt;&gt;     if pos == "JJ": # Retrieve all adjectives.
&gt;&gt;&gt;         print word
happy</pre></div>
<h3>Parser output</h3>
<p>The output of&nbsp;<span class="inline_code">parse()</span>&nbsp;is a string of sentences in which each word has been annotated with the requested tags. The <span class="inline_code">pprint()</span> function gives a human-readable breakdown of the tags (the extra <em>p-</em> is for <em>pretty</em>).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import parse
&gt;&gt;&gt; from pattern.en import pprint
&gt;&gt;&gt;
&gt;&gt;&gt; pprint(parse('I ate pizza.', relations=True, lemmata=True))
WORD TAG CHUNK ROLE ID PNP LEMMA
I PRP NP SBJ 1 - i
ate VBD VP - 1 - eat
pizza NN NP OBJ 1 - pizza
. . - - - - . </pre></div>
<p>The output of <span class="inline_code">parse()</span> is a subclass of <span class="inline_code">unicode</span> called&nbsp;<span class="inline_code">TaggedString</span>&nbsp;whose&nbsp;<span class="inline_code">TaggedString.split()</span> method by default yields a list of sentences, where each sentence is a list of tokens, where each token is a list of the word + its tags.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import parse
&gt;&gt;&gt; print parse('I ate pizza.').split()
[[[u'I', u'PRP', u'B-NP', u'O'],
[u'ate', u'VBD', u'B-VP', u'O'],
[u'pizza', u'NN', u'B-NP', u'O'],
[u'.', u'.', u'O', u'O']]] </pre></div>
<p>The most convenient way to analyze and mine the output is to construct&nbsp;a <a href="#tree" target="_self">parse tree</a>.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="tree"></a>Parse trees</h2>
<p>A parse tree stores a tagged string as a tree of nested objects that can be traversed to analyze the constituents in the text. The <span class="inline_code">parsetree()</span> function takes the same parameters as <span class="inline_code">parse()</span> and returns a <span class="inline_code">Text</span> object.&nbsp;A&nbsp;<span class="inline_code">Text</span> is a list of <span class="inline_code">Sentence</span> objects. Each <span class="inline_code">Sentence</span> is a list of <span class="inline_code">Word</span> objects. <span class="inline_code">Word</span> objects can be grouped in <span class="inline_code">Chunk</span> objects, which are related to other <span class="inline_code">Chunk</span> objects.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">parsetree(string,
tokenize = True, # Split punctuation marks from words?
tags = True, # Parse part-of-speech tags? (NN, JJ, ...)
chunks = True, # Parse chunks? (NP, VP, PNP, ...)
relations = False, # Parse chunk relations? (-SBJ, -OBJ, ...)
lemmata = False, # Parse lemmata? (ate =&gt; eat)
encoding = 'utf-8', # Input string encoding.
tagset = None) # Penn Treebank II (default) or UNIVERSAL.
</pre><p>The following example shows the parse tree for the sentence "<em>The cat sat on the mat.</em>":</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; s = parsetree('The cat sat on the mat.', relations=True, lemmata=True)
&gt;&gt;&gt; print repr(s)
[Sentence(
u'The/DT/B-NP/O/NP-SBJ-1/the
cat/NN/I-NP/O/NP-SBJ-1/cat
sat/VBD/B-VP/O/VP-1/sit
on/IN/B-PP/B-PNP/O/on
the/DT/B-NP/I-PNP/O/the
mat/NN/I-NP/I-PNP/O/mat
././O/O/O/O/.')]</pre><pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; for sentence in s:
&gt;&gt;&gt;     for chunk in sentence.chunks:
&gt;&gt;&gt;         print chunk.type, [(w.string, w.type) for w in chunk.words]
NP [(u'The', u'DT'), (u'cat', u'NN')]
VP [(u'sat', u'VBD')]
PP [(u'on', u'IN')]
NP [(u'the', u'DT'), (u'mat', u'NN')]
</pre></div>
<p>A common approach is to store output from <span class="inline_code">parse()</span>&nbsp;in a .txt file, with a tagged sentence on each line.&nbsp;The <span class="inline_code">tree()</span> function can be used to load it as a <span class="inline_code">Text</span> object. It has an optional <span class="inline_code">token</span> parameter that defines the format of the tokens (tagged words).&nbsp;So&nbsp;<span class="inline_code">parsetree(s)</span>&nbsp;is the same as&nbsp;<span class="inline_code">tree(parse(s)</span><span class="inline_code">)</span>.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">tree(taggedstring, token=[WORD, POS, CHUNK, PNP, REL, LEMMA])</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.en import tree
&gt;&gt;&gt;
&gt;&gt;&gt; for sentence in tree(open('tagged.txt'), token=[WORD, POS, CHUNK]):
&gt;&gt;&gt;     print sentence</pre></div>
<h3>Text</h3>
<p>A <span class="inline_code">Text</span> is a list of <span class="inline_code">Sentence</span> objects (i.e., it can be iterated with&nbsp;<span class="inline_code">for</span> <span class="inline_code">sentence</span> <span class="inline_code">in</span> <span class="inline_code">text:</span>).</p>
<pre class="brush:python; gutter:false; light:true;">text = Text(taggedstring, token=[WORD, POS, CHUNK, PNP, REL, LEMMA])</pre><pre class="brush:python; gutter:false; light:true;">text = Text.from_xml(xml) # Reads an XML string generated with Text.xml.
</pre><pre class="brush:python; gutter:false; light:true;">text.string # 'The cat sat on the mat .'
text.sentences # [Sentence('The cat sat on the mat .')]
text.copy()
text.xml</pre><h3>Sentence</h3>
<p>A <span class="inline_code">Sentence</span> is a list of <span class="inline_code">Word</span> objects, with attributes and methods that group words in <span class="inline_code">Chunk</span> objects.</p>
<pre class="brush:python; gutter:false; light:true;">sentence = Sentence(taggedstring, token=[WORD, POS, CHUNK, PNP, REL, LEMMA])</pre><pre class="brush:python; gutter:false; light:true;">sentence = Sentence.from_xml(xml)
</pre><pre class="brush:python; gutter:false; light:true;">sentence.parent # Sentence parent, or None.
sentence.id # Unique id for each sentence.
sentence.start # 0
sentence.stop # len(Sentence).
</pre><pre class="brush:python; gutter:false; light:true;">sentence.string # Tokenized string, without tags.
sentence.words # List of Word objects.
sentence.lemmata # List of word lemmata.
sentence.chunks # List of Chunk objects.
sentence.subjects # List of NP-SBJ chunks.
sentence.objects # List of NP-OBJ chunks.
sentence.verbs # List of VP chunks.
sentence.relations # {'SBJ': {1: Chunk('the cat/NP-SBJ-1')},
# 'VP': {1: Chunk('sat/VP-1')},
# 'OBJ': {}}
sentence.pnp # List of PNPChunks: [Chunk('on the mat/PNP')]
</pre><pre class="brush:python; gutter:false; light:true;">sentence.constituents(pnp=False)</pre><pre class="brush:python; gutter:false; light:true;">sentence.slice(start, stop)
sentence.copy()
sentence.xml
</pre><ul>
<li><span class="inline_code">Sentence.constituents()</span> returns a mixed, in-order list of <span class="inline_code">Word</span> and <span class="inline_code">Chunk</span> objects.<br />With <span class="inline_code">pnp=True</span>, it will yield&nbsp;<span class="inline_code">PNPChunk</span> objects whenever possible.</li>
<li><span class="inline_code">Sentence.slice()</span>&nbsp;returns a <span class="inline_code">Slice</span> (= a subclass of <span class="inline_code">Sentence</span>) starting with the word at index <span class="inline_code">start</span> and containing all words up to (not including) index <span class="inline_code">stop</span>.</li>
</ul>
<h3>Sentence words</h3>
<p>A <span class="inline_code">Sentence</span> is made up of <span class="inline_code">Word</span> objects, which are also grouped in <span class="inline_code">Chunk</span> objects:</p>
<pre class="brush:python; gutter:false; light:true;">word = Word(sentence, string, lemma=None, type=None, index=0)</pre><pre class="brush:python; gutter:false; light:true;">word.sentence # Sentence parent.
word.index # Sentence index of word.
word.string # String (Unicode).
word.lemma # String lemma, e.g. 'sat' =&gt; 'sit'.
word.type # Part-of-speech tag (NN, JJ, VBD, ...)
word.chunk # Chunk parent, or None.
word.pnp # PNPChunk parent, or None.</pre><h3>Sentence chunks</h3>
<p>A <span class="inline_code">Chunk</span> is a list of <span class="inline_code">Word</span> objects that belong together. <br />Multiple chunks can be part of a <span class="inline_code">PNPChunk</span>, which starts with a <span class="postag">PP</span> chunk followed by <span class="postag">NP</span> chunks.</p>
<pre class="brush:python; gutter:false; light:true;">chunk = Chunk(sentence, words=[], type=None, role=None, relation=None)</pre><pre class="brush:python; gutter:false; light:true;">chunk.sentence # Sentence parent.
chunk.start # Sentence index of first word.
chunk.stop # Sentence index of last word + 1.
chunk.string # String of words (Unicode).
chunk.words # List of Word objects.
chunk.lemmata # List of word lemmata.
chunk.head # Primary Word in the chunk.
chunk.type # Chunk tag (NP, VP, PP, ...)
chunk.role # Role tag (SBJ, OBJ, ...)
chunk.relation # Relation id, e.g. NP-SBJ-1 =&gt; 1.
chunk.relations # List of (id, role)-tuples.
chunk.related # List of Chunks with same relation id.
chunk.subject # NP-SBJ chunk with same id.
chunk.object # NP-OBJ chunk with same id.
chunk.verb # VP chunk with same id.
chunk.modifiers # []
chunk.conjunctions # []
chunk.pnp # PNPChunk parent, or None.
</pre><pre class="brush:python; gutter:false; light:true;">chunk.previous(type=None)
chunk.next(type=None)
chunk.nearest(type='VP')</pre><ul>
<li><span class="inline_code">Chunk.head</span> yields the primary&nbsp;<span class="inline_code">Word</span> in the chunk: <em>the big cat</em> → <em>cat</em>.</li>
<li><span class="inline_code">Chunk.relations</span>&nbsp;contains all relations the chunk is part of. <br />Some chunks have multiple relations, e.g., <span class="postag">SBJ</span> as well as&nbsp;<span class="postag">OBJ</span>, or&nbsp;<span class="postag">OBJ</span> of multiple <span class="postag">VP</span>'s.</li>
<li>For <span class="postag">VP</span> chunks, <span class="inline_code">Chunk.modifiers</span> is a list of nearby adjectives and adverbs that have no relations. <br />For example, in <em>the cat purred happily</em>, modifier of&nbsp;<em>purred</em> →&nbsp;<em>happily</em>.</li>
<li><span class="inline_code">Chunk.conjunctions</span> is a list of chunks linked by <em>and</em>&nbsp;and&nbsp;<em>or</em> to this chunk. <br />For example in <em>up and down</em>: the <em>up</em> chunk has conjunctions: <span class="inline_code">[(Chunk('down'),</span> <span class="inline_code">AND)]</span>.</li>
</ul>
<h3>Prepositional noun phrases</h3>
<p>A <span class="inline_code">PNPChunk</span>&nbsp;or prepositional noun phrase is a subclass of <span class="inline_code">Chunk</span>.&nbsp;It groups <span class="postag">PP</span> + <span class="postag">NP</span> chunks (= <span class="postag">PNP</span>).</p>
<pre class="brush:python; gutter:false; light:true;">pnp = PNPChunk(sentence, words=[], type=None, role=None, relation=None)</pre><pre class="brush:python; gutter:false; light:true;">pnp.string # String of words (Unicode).
pnp.chunks # List of Chunk objects.
pnp.preposition # First PP chunk in the PNP.
</pre><p>Words and chunks that are part of a <span class="postag">PNP</span> will have their <span class="inline_code">Word.pnp</span> and <span class="inline_code">Chunk.pnp</span> attribute set.&nbsp;All prepositional noun phrases in a sentence can be retrieved with <span class="inline_code">Sentence.pnp</span>.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="sentiment"></a>Sentiment</h2>
<p>Written text can be broadly categorized into two types: facts and opinions. Opinions carry people's sentiments, appraisals and feelings toward the world. The pattern.en module bundles a lexicon of adjectives (e.g., <em>good</em>, <em>bad</em>, <em>amazing</em>, <em>irritating</em>, ...) that occur frequently in product reviews, annotated with scores for sentiment polarity (positive ↔&nbsp;negative) and subjectivity (objective ↔ subjective).&nbsp;</p>
<p>The <span class="inline_code">sentiment()</span> function returns a <span class="inline_code">(polarity,</span> <span class="inline_code">subjectivity)</span>-tuple for the given sentence, based on the adjectives it contains,&nbsp;where polarity is a value between <span class="inline_code">-1.0</span> and <span class="inline_code">+1.0</span> and subjectivity between <span class="inline_code">0.0</span> and <span class="inline_code">1.0</span>.&nbsp;The sentence can be a string, <span class="inline_code">Text</span>, <span class="inline_code">Sentence</span>, <span class="inline_code">Chunk</span>,&nbsp;<span class="inline_code">Word</span> or a&nbsp;<span class="inline_code">Synset</span> (see below).&nbsp;</p>
<p>The <span class="inline_code">positive()</span> function returns <span class="inline_code">True</span> if the given sentence's polarity is above the threshold. The threshold can be lowered or raised, but overall <span class="inline_code">+0.1</span> gives the best results for product reviews. Accuracy is about 75% for movie reviews.</p>
<pre class="brush:python; gutter:false; light:true;">sentiment(sentence) # Returns a (polarity, subjectivity)-tuple.</pre><pre class="brush:python; gutter:false; light:true;">positive(s, threshold=0.1) # Returns True if polarity &gt;= threshold.</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import sentiment
&gt;&gt;&gt;
&gt;&gt;&gt; print sentiment(
&gt;&gt;&gt; "The movie attempts to be surreal by incorporating various time paradoxes,"
&gt;&gt;&gt; "but it's presented in such a ridiculous way it's seriously boring.")
(-0.34, 1.0) </pre></div>
<p>In the example above,&nbsp;<span class="inline_code">-0.34</span> is the average of&nbsp;<em>surreal</em>, <em>various</em>, <em>ridiculous</em> and <em>seriously boring</em>.&nbsp;To retrieve the scores for individual words, use the special <span class="inline_code">assessments</span> property, which yields a list of <span class="inline_code">(words,</span> <span class="inline_code">polarity,</span> <span class="inline_code">subjectivity,</span> <span class="inline_code">label)</span>-tuples.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; print sentiment('Wonderfully awful! :-)').assessments
[(['wonderfully', 'awful', '!'], -1.0, 1.0, None),
([':-)'], 0.5, 1.0, 'mood')]
</pre></div>
<p>&nbsp;&nbsp;</p>
<hr />
<h2><a name="modality"></a>Mood &amp; modality</h2>
<p>Grammatical mood refers to the use of auxiliary verbs (e.g., <em>could</em>, <em>would</em>) and adverbs (e.g., <em>definitely</em>,<em> maybe</em>) to express uncertainty.&nbsp;</p>
<p>The <span class="inline_code">mood()</span> function returns either&nbsp;<span class="inline_code">INDICATIVE</span>, <span class="inline_code">IMPERATIVE</span>, <span class="inline_code">CONDITIONAL</span>&nbsp;or <span class="inline_code">SUBJUNCTIVE</span>&nbsp;for a given parsed&nbsp;<span class="inline_code">Sentence</span>. See the table below for an overview of moods.</p>
<p>The <span class="inline_code">modality()</span> function returns the degree of certainty as a value between <span class="inline_code">-1.0</span> and <span class="inline_code">+1.0</span>, where values <span class="inline_code">&gt;</span> <span class="inline_code">+0.5</span> represent facts. For example, "<em>I wish it would stop raining</em>" scores <span class="inline_code">-0.35</span>, whereas "<em>It will stop raining</em>" scores <span class="inline_code">+0.75</span>. Accuracy is about 68% for Wikipedia texts.</p>
<pre class="brush:python; gutter:false; light:true;">mood(sentence) # Returns INDICATIVE | IMPERATIVE | CONDITIONAL | SUBJUNCTIVE</pre><pre class="brush:python; gutter:false; light:true;">modality(sentence) # Returns -1.0 =&gt; +1.0.</pre><table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Mood</span></td>
<td><span class="smallcaps">Form</span></td>
<td><span class="smallcaps">Use</span></td>
<td><span class="smallcaps">Example</span></td>
</tr>
<tr>
<td><span class="inline_code">INDICATIVE</span></td>
<td>none of the below&nbsp;</td>
<td>fact, belief</td>
<td><em>It rains.</em></td>
</tr>
<tr>
<td><span class="inline_code">IMPERATIVE</span></td>
<td>infinitive without <em>to</em></td>
<td>command, warning</td>
<td><em><span style="text-decoration: underline;">Do</span>n't rain!</em></td>
</tr>
<tr>
<td><span class="inline_code">CONDITIONAL</span></td>
<td><em>would</em>, <em>could</em>, <em>should</em>, <em>may</em>, or <em>will</em>,&nbsp;<em>can</em> + <em>if</em></td>
<td>conjecture</td>
<td><em>It <span style="text-decoration: underline;">might</span> rain.</em></td>
</tr>
<tr>
<td><span class="inline_code">SUBJUNCTIVE</span></td>
<td><em>wish</em>, <em>were</em>, or&nbsp;<em>it is</em> + infinitive</td>
<td>wish, opinion</td>
<td><em>I <span style="text-decoration: underline;">hope</span> it rains.</em></td>
</tr>
</tbody>
</table>
<p>For example:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.en import parse, Sentence
&gt;&gt;&gt; from pattern.en import modality
&gt;&gt;&gt;
&gt;&gt;&gt; s = "Some amino acids tend to be acidic while others may be basic." # weaseling
&gt;&gt;&gt; s = parse(s, lemmata=True)
&gt;&gt;&gt; s = Sentence(s)
&gt;&gt;&gt;
&gt;&gt;&gt; print modality(s)
0.11</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="wordnet"></a>WordNet</h2>
<p>The pattern.en.wordnet module includes WordNet 3.0 and Oliver Steele's PyWordNet module. <a href="http://wordnet.princeton.edu/" target="_blank">WordNet</a> is a lexical database that groups related words into <span class="inline_code">Synset</span> objects (= sets of synonyms). Each synset provides a short definition and semantic relations to other synsets.</p>
<p>The <span class="inline_code">synsets()</span> function returns a list of <span class="inline_code">Synset</span> objects for a given word, where each set corresponds to a word sense (e.g., <em>tree</em> in the sense of plant, <em>tree</em> in the sense of diagram, etc.)</p>
<pre class="brush:python; gutter:false; light:true;">synset = wordnet.synsets(word, pos=NOUN)[i]</pre><pre class="brush:python; gutter:false; light:true;">synset.pos # Part-of-speech: NOUN | VERB | ADJECTIVE | ADVERB.
synset.synonyms # List of word forms (i.e., synonyms).
synset.gloss # Definition string.
synset.lexname # Category string, or None.
synset.ic # Information Content (float).
</pre><pre class="brush:python; gutter:false; light:true;">synset.antonym # Synset (semantic opposite).
synset.hypernym # Synset (semantic parent).</pre><pre class="brush:python; gutter:false; light:true;">synset.hypernyms(recursive=False, depth=None)
synset.hyponyms(recursive=False, depth=None)
synset.meronyms() # List of synsets (members/parts).
synset.holonyms() # List of synsets (of which this is a member).
synset.similar() # List of synsets (similar adjectives/verbs).</pre><ul>
<li><span class="inline_code">Synset.hypernyms()</span> returns a list of parent synsets (i.e., more general).</li>
<li><span class="inline_code">Synset.hyponyms()</span> returns a list of child synsets (i.e., more specific).<br />With <span class="inline_code">recursive=True</span>, returns parents of parents or children of children.<br />Optionally, returns parents or children recursively up to the given <span class="inline_code">depth</span>.</li>
</ul>
<p>For example:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import wordnet
&gt;&gt;&gt;
&gt;&gt;&gt; s = wordnet.synsets('bird')[0]
&gt;&gt;&gt;
&gt;&gt;&gt; print 'Definition:', s.gloss
&gt;&gt;&gt; print ' Synonyms:', s.synonyms
&gt;&gt;&gt; print ' Hypernyms:', s.hypernyms()
&gt;&gt;&gt; print ' Hyponyms:', s.hyponyms()
&gt;&gt;&gt; print ' Holonyms:', s.holonyms()
&gt;&gt;&gt; print ' Meronyms:', s.meronyms()
Definition: u'warm-blooded egg-laying vertebrates characterized '
'by feathers and forelimbs modified as wings'
Synonyms: [u'bird']
Hypernyms: [Synset(u'vertebrate')]
Hyponyms: [Synset(u'cock'), Synset(u'hen'), ...]
Holonyms: [Synset(u'Aves'), Synset(u'flock')]
Meronyms: [Synset(u'beak'), Synset(u'feather'), ...]</pre></div>
<div class="example"><span class="small"><span style="text-decoration: underline;">Reference</span>: Fellbaum, C. (1998). </span><em class="small">WordNet: An Electronic Lexical Database</em><span class="small">. Cambridge, MIT Press.</span></div>
<h3>Synset similarity</h3>
<p>The <span class="inline_code">ancestor()</span> function returns the common ancestor&nbsp;of two synsets.&nbsp;The <span class="inline_code">similarity()</span> function returns the semantic similarity of two synsets as a value between <span class="inline_code">0.0</span> and <span class="inline_code">1.0</span>.</p>
<pre class="brush:python; gutter:false; light:true;">wordnet.ancestor(synset1, synset2)</pre><pre class="brush:python; gutter:false; light:true;">wordnet.similarity(synset1, synset2)
</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import wordnet
&gt;&gt;&gt;
&gt;&gt;&gt; a = wordnet.synsets('cat')[0]
&gt;&gt;&gt; b = wordnet.synsets('dog')[0]
&gt;&gt;&gt; c = wordnet.synsets('box')[0]
&gt;&gt;&gt;
&gt;&gt;&gt; print wordnet.ancestor(a, b)
&gt;&gt;&gt;
&gt;&gt;&gt; print wordnet.similarity(a, a)
&gt;&gt;&gt; print wordnet.similarity(a, b)
&gt;&gt;&gt; print wordnet.similarity(a, c)
Synset('carnivore')
1.0
0.86
0.17 </pre></div>
<p>Similarity is calculated using Lin's formula and Resnik's Information Content (IC). IC values for each synset are derived from the word count in the Brown corpus.</p>
<p><span class="inline_code">lin</span> <span class="inline_code">=</span> <span class="inline_code">2.0</span> <span class="inline_code">*</span> <span class="inline_code">log(ancestor(synset1,</span> <span class="inline_code">synset2).ic)</span> <span class="inline_code">/</span> <span class="inline_code">log(synset1.ic</span> <span class="inline_code">*</span> <span class="inline_code">synset2.ic)</span></p>
<h3>Synset sentiment</h3>
<p><a href="http://sentiwordnet.isti.cnr.it/" target="_blank">SentiWordNet</a> is a lexical resource for opinion mining, with polarity and subjectivity scores for all WordNet synsets. SentiWordNet is free for non-commercial research purposes. To use SentiWordNet, request a download from the authors and put&nbsp;<span class="inline_code">SentiWordNet*.txt</span> in&nbsp;<span class="inline_code">pattern/en/wordnet/</span>.&nbsp;You can then use&nbsp;<span class="inline_code">Synset.weight()</span> in your script:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import wordnet
&gt;&gt;&gt; from pattern.en import ADJECTIVE
&gt;&gt;&gt;
&gt;&gt;&gt; print wordnet.synsets('happy', ADJECTIVE)[0].weight
&gt;&gt;&gt; print wordnet.synsets('sad', ADJECTIVE)[0].weight
(0.375, 0.875)
(-0.625, 0.875)
</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="wordlist"></a>Wordlists</h2>
<p>The pattern.en module includes a number of general-purpose word lists:</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">List</span></td>
<td><span class="smallcaps">Description</span></td>
<td style="text-align: center;"><span class="smallcaps">Size</span></td>
<td><span class="smallcaps">Example</span></td>
</tr>
<tr>
<td><span class="inline_code">ACADEMIC</span></td>
<td>English academic words</td>
<td style="text-align: center;">500</td>
<td><em>criterion</em>, <em>proportionally</em>, <em>research</em></td>
</tr>
<tr>
<td><span class="inline_code">BASIC</span></td>
<td>English basic words</td>
<td style="text-align: center;">1,000</td>
<td><em>chicken</em>, <em>pain</em>, <em>road</em></td>
</tr>
<tr>
<td><span class="inline_code">PROFANITY</span></td>
<td>English swear words</td>
<td style="text-align: center;">350</td>
<td>&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">TIME</span></td>
<td>English time &amp; date words</td>
<td style="text-align: center;">100</td>
<td><em>Christmas</em>, <em>past</em>, <em>saturday</em></td>
</tr>
</tbody>
</table>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en.wordlist import ACADEMIC
&gt;&gt;&gt;
&gt;&gt;&gt; words = open('paper.txt').read().split()
&gt;&gt;&gt; words = [w for w in words if w not in ACADEMIC] </pre></div>
<p>&nbsp;</p>
<hr />
<h2>See also</h2>
<ul>
<li><a href="http://www.clips.ua.ac.be/pages/MBSP" target="_blank">MBSP</a> (GPL): r<span>obust parser using a memory-based learning approach, in Python.</span></li>
<li><span><a href="http://www.nltk.org/" target="_blank">NLTK</a> (Apache): f</span><span>ull natural language processing toolkit for Python.</span></li>
</ul>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,579 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-es</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-es" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-es</a></div>
<h1>pattern.es</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1626" class="node node-type-page"><div class="node-inner">
<div class="content">
<p><span class="big">The pattern.es module contains a fast part-of-speech tagger for Spanish (identifies nouns, adjectives, verbs, etc. in a sentence) and tools for Spanish verb conjugation and noun singularization &amp; pluralization.</span></p>
<p>It can be used by itself or with other&nbsp;<a href="pattern.html">pattern</a>&nbsp;modules:&nbsp;<a href="pattern-web.html">web</a>&nbsp;|&nbsp;<a href="pattern-db.html">db</a>&nbsp;| <a href="pattern-en.html">en</a>&nbsp;|&nbsp;<a href="pattern-search.html">search</a>&nbsp;|&nbsp;<a href="pattern-vector.html">vector</a>&nbsp;|&nbsp;<a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema_es.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<p>The functions in this module take the same parameters and return the same values as their counterparts in <a href="pattern-en.html">pattern.en</a>. Refer to the documentation there for more details.&nbsp;&nbsp;</p>
<h3>Noun singularization &amp; pluralization</h3>
<p>For Spanish nouns there is <span class="inline_code">singularize()</span> and <span class="inline_code">pluralize()</span>.&nbsp;The implementation is slightly less robust than the English version (accuracy 94% for singularization and 78% for pluralization).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.es import singularize, pluralize
&gt;&gt;&gt;
&gt;&gt;&gt; print singularize('gatos')
&gt;&gt;&gt; print pluralize('gato')
gato
gatos </pre></div>
<h3>Verb conjugation</h3>
<p>For Spanish verbs there is <span class="inline_code">conjugate()</span>, <span class="inline_code">lemma()</span>, <span class="inline_code">lexeme()</span> and <span class="inline_code">tenses()</span>.&nbsp;The lexicon for verb conjugation contains about 600 common Spanish verbs, composed by Fred Jehle. For unknown verbs it will fall back to a rule-based approach with an accuracy of about 84%.&nbsp;</p>
<p>Spanish verbs have more tenses than English verbs. In particular, the plural differs for each person, and there are additional forms for the&nbsp;<span class="inline_code">FUTURE</span>&nbsp;and&nbsp;<span class="inline_code">CONDITIONAL</span>&nbsp;tense, the&nbsp;<span class="inline_code">IMPERATIVE</span>&nbsp;and&nbsp;<span class="inline_code">SUBJUNCTIVE</span>&nbsp;mood and the&nbsp;<span class="inline_code">PERFECTIVE</span>&nbsp;aspect:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.es import conjugate
&gt;&gt;&gt; from pattern.es import INFINITIVE, PRESENT, PAST, SG, SUBJUNCTIVE, PERFECTIVE
&gt;&gt;&gt;
&gt;&gt;&gt; print conjugate('soy', INFINITIVE)
&gt;&gt;&gt; print conjugate('soy', PRESENT, 1, SG, mood=SUBJUNCTIVE)
&gt;&gt;&gt; print conjugate('soy', PAST, 3, SG)
&gt;&gt;&gt; print conjugate('soy', PAST, 3, SG, aspect=PERFECTIVE)
ser
sea
era
fue </pre></div>
<p>For <span class="inline_code">PAST</span>&nbsp;tense + <span class="inline_code">PERFECTIVE</span>&nbsp;aspect we can also use <span class="inline_code">PRETERITE</span>. For <span class="inline_code">PAST</span>&nbsp;tense + <span class="inline_code">IMPERFECTIVE</span>&nbsp;aspect we can also use <span class="inline_code">IMPERFECT</span>:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.es import conjugate
&gt;&gt;&gt; from pattern.es import IMPERFECT, PRETERITE
&gt;&gt;&gt;
&gt;&gt;&gt; print conjugate('soy', IMPERFECT, 3, SG)
&gt;&gt;&gt; print conjugate('soy', PRETERITE, 3, SG)
era
fue </pre></div>
<p>&nbsp;The <span class="inline_code">conjugate()</span> function takes the following optional parameters:</p>
<table class="border">
<tbody>
<tr>
<td class="smallcaps">Tense</td>
<td class="smallcaps">Person</td>
<td class="smallcaps">Number</td>
<td class="smallcaps">Mood</td>
<td class="smallcaps">Aspect</td>
<td class="smallcaps">Alias</td>
<td class="smallcaps">Example</td>
</tr>
<tr>
<td class="inline_code">INFINITIVE</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">"inf"</td>
<td><em>ser</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg"</td>
<td><em>yo&nbsp;<span style="text-decoration: underline;">soy</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg"</td>
<td><em>tú&nbsp;<span style="text-decoration: underline;">eres</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg"</td>
<td><em>el&nbsp;<span style="text-decoration: underline;">es</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl"</td>
<td><em>nosotros&nbsp;<span style="text-decoration: underline;">somos</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl"</td>
<td><em>vosotros&nbsp;<span style="text-decoration: underline;">sois</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl"</td>
<td><em>ellos&nbsp;<span style="text-decoration: underline;">son</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PROGRESSIVE</td>
<td class="inline_code">"part"</td>
<td><em>siendo</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg!"</td>
<td><em>sé</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl!"</td>
<td><em>sed</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg?"</td>
<td><em>yo&nbsp;<span style="text-decoration: underline;">sea</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg?"</td>
<td><em>tú&nbsp;<span style="text-decoration: underline;">seas</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg?"</td>
<td><em>el&nbsp;<span style="text-decoration: underline;">sea</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl?"</td>
<td><em>nosotros&nbsp;<span style="text-decoration: underline;">seamos</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl?"</td>
<td><em>vosotros&nbsp;<span style="text-decoration: underline;">seáis</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl?"</td>
<td><em>ellos&nbsp;<span style="text-decoration: underline;">sean</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgp"</td>
<td><em>yo&nbsp;<span style="text-decoration: underline;">era</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgp"</td>
<td><em>tú&nbsp;<span style="text-decoration: underline;">eras</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgp"</td>
<td><em>el&nbsp;<span style="text-decoration: underline;">era</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1ppl"</td>
<td><em>nosotros&nbsp;<span style="text-decoration: underline;">éramos</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2ppl"</td>
<td><em>vosotros&nbsp;<span style="text-decoration: underline;">erais</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3ppl"</td>
<td><em>ellos&nbsp;<span style="text-decoration: underline;">eran</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PROGRESSIVE</td>
<td class="inline_code">"ppart"</td>
<td><em>sido</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"1sgp+"</td>
<td><em>yo&nbsp;<span style="text-decoration: underline;">fui</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"2sgp+"</td>
<td><em>tú&nbsp;<span style="text-decoration: underline;">fuiste</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"3sgp+"</td>
<td><em>el&nbsp;<span style="text-decoration: underline;">fue</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"1ppl+"</td>
<td><em>nosotros&nbsp;<span style="text-decoration: underline;">fuimos</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"2ppl+"</td>
<td><em>vosotros&nbsp;<span style="text-decoration: underline;">fuisteis</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"3ppl+"</td>
<td><em>ellos&nbsp;<span style="text-decoration: underline;">fueron</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgp?"</td>
<td><em>yo&nbsp;<span style="text-decoration: underline;">fuera</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgp?"</td>
<td><em>tú&nbsp;<span style="text-decoration: underline;">fueras</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgp?"</td>
<td><em>el&nbsp;<span style="text-decoration: underline;">fuera</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1ppl?"</td>
<td><em>nosotros&nbsp;<span style="text-decoration: underline;">fuéramos</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2ppl?"</td>
<td><em>vosotros&nbsp;<span style="text-decoration: underline;">fuerais</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3ppl?"</td>
<td><em>ellos&nbsp;<span style="text-decoration: underline;">fueran</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgf"</td>
<td><em>yo&nbsp;<span style="text-decoration: underline;">seré</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgf"</td>
<td><em>tú&nbsp;<span style="text-decoration: underline;">serás</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgf"</td>
<td><em>el&nbsp;<span style="text-decoration: underline;">será</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1plf"</td>
<td><em>nosotros&nbsp;<span style="text-decoration: underline;">seremos</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2plf"</td>
<td><em>vosotros&nbsp;<span style="text-decoration: underline;">seréis</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3plf"</td>
<td><em>ellos&nbsp;<span style="text-decoration: underline;">serán</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg-&gt;"</td>
<td><em>yo&nbsp;<span style="text-decoration: underline;">sería</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg-&gt;"</td>
<td><em>tú&nbsp;<span style="text-decoration: underline;">serías</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg-&gt;"</td>
<td><em>el&nbsp;<span style="text-decoration: underline;">sería</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl-&gt;"</td>
<td><em>nosotros&nbsp;<span style="text-decoration: underline;">seríamos</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl-&gt;"</td>
<td><em>vosotros&nbsp;<span style="text-decoration: underline;">seríais</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl-&gt;"</td>
<td><em>ellos&nbsp;<span style="text-decoration: underline;">serían</span></em></td>
</tr>
</tbody>
</table>
<p>Instead of optional parameters, a single short alias, or&nbsp;<span class="inline_code">PARTICIPLE</span> or <span class="inline_code">PAST+PARTICIPLE</span> can also be given. With no parameters, the infinitive form of the verb is returned.</p>
<p class="small"><span style="text-decoration: underline;">Reference</span><span>: Jehle, F. (2012).&nbsp;<em>Spanish Verb Forms</em>. Retrieved from:&nbsp;</span><span><a class="noexternal" style="color: inherit;" href="http://users.ipfw.edu/jehle/verblist.htm" target="_blank">http://users.ipfw.edu/jehle/verblist.htm</a>.</span></p>
<h3>Attributive &amp; predicative adjectives&nbsp;</h3>
<p>Spanish adjectives inflect with an <span class="inline_code">-o</span>,&nbsp;<span class="inline_code">-a</span>, <span class="inline_code">-os</span>, <span class="inline_code">-as</span>, or <span class="inline_code">-es</span> suffix (e.g., <em>curioso</em> → <em>los gatos curiosos</em>) depending on gender. You can get the base form with the <span class="inline_code">predicative()</span> function, or vice versa with&nbsp;<span class="inline_code">attributive()</span>.&nbsp;For predicative, a statistical approach is used with an accuracy of 93%. For attributive, you need to supply gender (<span class="inline_code">MALE</span>, <span class="inline_code">FEMALE</span>, <span class="inline_code">NEUTRAL</span>&nbsp;and/or <span class="inline_code">PLURAL</span>).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.es import attributive, predicative
&gt;&gt;&gt; from pattern.es import FEMALE, PLURAL
&gt;&gt;&gt;
&gt;&gt;&gt; print predicative('curiosos')
&gt;&gt;&gt; print attributive('curioso', gender=FEMALE)
&gt;&gt;&gt; print attributive('curioso', gender=FEMALE+PLURAL)
curioso
curiosa
curiosas </pre></div>
<h3>Parser</h3>
<p>For parsing there is <span class="inline_code" style="font-family: Courier, monospace; font-size: 12px;">parse()</span>, <span class="inline_code">parsetree()</span> and&nbsp;<span class="inline_code" style="font-family: Courier, monospace; font-size: 12px;">split()</span>. The <span class="inline_code">parse()</span> function annotates words in the given string with their part-of-speech <a class="link-maintenance" href="mbsp-tags.html">tags</a>&nbsp;(e.g., <span class="postag">NN</span> for nouns and <span class="postag">VB</span> for verbs). The <span class="inline_code">parsetree()</span> function takes a string and returns a tree of nested objects (<span class="inline_code">Text</span> → <span class="inline_code">Sentence</span> → <span class="inline_code">Chunk</span> → <span class="inline_code">Word</span>). The <span class="inline_code">split()</span> function takes the output of <span class="inline_code">parse()</span> and returns a <span class="inline_code">Text</span>.&nbsp;See the <span class="inline_code">pattern.en</span> documentation&nbsp;(<span class="link-maintenance" style="color: #78aaff;"><a style="color: #8caaff; outline-style: none !important; outline-width: initial !important; outline-color: initial !important;" href="pattern-en.html#tree">here</a></span>) for how to manipulate <span class="inline_code">Text</span>&nbsp;objects.&nbsp;</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.es import parse, split
&gt;&gt;&gt;
&gt;&gt;&gt; s = parse('El gato negro se sienta en la estera.')
&gt;&gt;&gt; for sentence in split(s):
&gt;&gt;&gt; print sentence
Sentence('El/DT/B-NP/O gato/NN/I-NP/O negro/JJ/I-NP/O'
'se/PRP/B-NP/O sienta/VB/B-VP/O'
'en/IN/B-PP/B-PNP la/DT/B-NP/I-PNP estera/NN/I-NP/I-PNP ././O/O')</pre></div>
<p>The parser is trained on the Spanish portion of <a href="http://www.lsi.upc.edu/~nlp/wikicorpus/" target="_blank">Wikicorpus</a>&nbsp;using 1.5M words from the tagged sections 10,000–15,000. The accuracy is around 92%.&nbsp;The original <a href="http://www.lsi.upc.edu/~nlp/SVMTool/parole.html" target="_blank">Parole</a>&nbsp;tagset is mapped to the <a href="mbsp-tags.html">Penn Treebank</a> tagset. If you need to work with the original tags you can also use&nbsp;<span class="inline_code">parse()</span> with an optional parameter <span class="inline_code">tagset="parole"</span>.</p>
<p class="small"><span style="text-decoration: underline;">Reference</span>:&nbsp;Reese, S., Boleda, G., Cuadros, M., Padró, L., Rigau, G. (2010).&nbsp;<br />Wikicorpus: A Word-Sense Disambiguated Multilingual Wikipedia Corpus.&nbsp;<em>Proceedings of LREC'10</em>.&nbsp;</p>
<h3>Sentiment analysis</h3>
<p>There's no&nbsp;<span class="inline_code">sentiment()</span> function for Spanish yet.</p>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,590 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-fr</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-fr" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-fr</a></div>
<h1>pattern.fr</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1697" class="node node-type-page"><div class="node-inner">
<div class="content">
<p><span class="big">The pattern.fr module contains a fast part-of-speech tagger for French (identifies nouns, adjectives, verbs, etc. in a sentence), sentiment analysis, and tools for French verb conjugation and noun singularization &amp; pluralization.</span></p>
<p>It can be used by itself or with other&nbsp;<a href="pattern.html">pattern</a>&nbsp;modules:&nbsp;<a href="pattern-web.html">web</a>&nbsp;|&nbsp;<a href="pattern-db.html">db</a>&nbsp;| <a href="pattern-en.html">en</a>&nbsp;|&nbsp;<a href="pattern-search.html">search</a>&nbsp;|&nbsp;<a href="pattern-vector.html">vector</a>&nbsp;|&nbsp;<a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema_fr.gif" alt="" /></p>
<hr />
<h2>Documentation</h2>
<p>The functions in this module take the same parameters and return the same values as their counterparts in <a href="pattern-en.html">pattern.en</a>. Refer to the documentation there for more details.&nbsp;&nbsp;</p>
<h3>Noun singularization &amp; pluralization</h3>
<p>For French nouns there is <span class="inline_code">singularize()</span> and <span class="inline_code">pluralize()</span>.&nbsp;The implementation uses a statistical approach with 93% accuracy for singularization and 92% for pluralization.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.fr import singularize, pluralize
&gt;&gt;&gt;
&gt;&gt;&gt; print singularize('chats')
&gt;&gt;&gt; print pluralize('chat')
chat
chats </pre></div>
<h3>Verb conjugation</h3>
<p>For French verbs there is <span class="inline_code">conjugate()</span>, <span class="inline_code">lemma()</span>, <span class="inline_code">lexeme()</span> and <span class="inline_code">tenses()</span>.&nbsp;The lexicon for verb conjugation contains about 1,750 common French verbs (constructed with Bob Salita's verb conjugation rules).&nbsp;For unknown verbs it will fall back to regular expressions with an accuracy of about 83%.&nbsp;</p>
<p>French verbs have more tenses than English verbs. In particular, the plural differs for each person, and there are additional forms for the&nbsp;<span class="inline_code">FUTURE</span>&nbsp;tense, the&nbsp;<span class="inline_code">IMPERATIVE</span>, <span class="inline_code">CONDITIONAL</span> and&nbsp;<span class="inline_code">SUBJUNCTIVE</span>&nbsp;mood and the&nbsp;<span class="inline_code">PERFECTIVE</span>&nbsp;aspect:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.fr import conjugate
&gt;&gt;&gt; from pattern.fr import INFINITIVE, PRESENT, PAST, SG, SUBJUNCTIVE, PERFECTIVE
&gt;&gt;&gt;
&gt;&gt;&gt; print conjugate('suis', INFINITIVE)
&gt;&gt;&gt; print conjugate('suis', PRESENT, 1, SG, mood=SUBJUNCTIVE)
&gt;&gt;&gt; print conjugate('suis', PAST, 3, SG)
&gt;&gt;&gt; print conjugate('suis', PAST, 3, SG, aspect=PERFECTIVE)
être
sois
était
fut </pre></div>
<p>For <span class="inline_code">PAST</span>&nbsp;tense + <span class="inline_code">PERFECTIVE</span>&nbsp;aspect we can also use <span class="inline_code">PRETERITE</span>&nbsp;(<em>passé simple</em>). For <span class="inline_code">PAST</span>&nbsp;tense + <span class="inline_code">IMPERFECTIVE</span>&nbsp;aspect we can also use <span class="inline_code">IMPERFECT</span>&nbsp;(<em>imparfait</em>):</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.fr import conjugate
&gt;&gt;&gt; from pattern.fr import IMPERFECT, PRETERITE
&gt;&gt;&gt;
&gt;&gt;&gt; print conjugate('suis', IMPERFECT, 3, SG)
&gt;&gt;&gt; print conjugate('suis', PRETERITE, 3, SG)
était
fut </pre></div>
<p>&nbsp;The <span class="inline_code">conjugate()</span> function takes the following optional parameters:</p>
<table class="border">
<tbody>
<tr>
<td class="smallcaps">Tense</td>
<td class="smallcaps">Person</td>
<td class="smallcaps">Number</td>
<td class="smallcaps">Mood</td>
<td class="smallcaps">Aspect</td>
<td class="smallcaps">Alias</td>
<td class="smallcaps">Example</td>
</tr>
<tr>
<td class="inline_code">INFINITIVE</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">"inf"</td>
<td><em>être</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg"</td>
<td><em>je&nbsp;<span style="text-decoration: underline;">suis</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">es</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg"</td>
<td><em>il&nbsp;<span style="text-decoration: underline;">est</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl"</td>
<td><em>nous&nbsp;<span style="text-decoration: underline;">sommes</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl"</td>
<td><em>vous&nbsp;<span style="text-decoration: underline;">êtes</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl"</td>
<td><em>ils&nbsp;<span style="text-decoration: underline;">sont</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PROGRESSIVE</td>
<td class="inline_code">"part"</td>
<td><em>étant</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg!"</td>
<td><em>sois</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl!"</td>
<td><em>soyons</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl!"</td>
<td><em>soyez</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg-&gt;"</td>
<td><em>je&nbsp;<span style="text-decoration: underline;">serais</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg-&gt;"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">serais</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg-&gt;"</td>
<td><em>il&nbsp;<span style="text-decoration: underline;">serait</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl-&gt;"</td>
<td><em>nous&nbsp;<span style="text-decoration: underline;">serions</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl-&gt;"</td>
<td><em>vous&nbsp;<span style="text-decoration: underline;">seriez</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl-&gt;"</td>
<td><em>ils&nbsp;<span style="text-decoration: underline;">seraient</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg?"</td>
<td><em>je&nbsp;<span style="text-decoration: underline;">sois</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg?"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">sois</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg?"</td>
<td><em>il&nbsp;<span style="text-decoration: underline;">soit</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl?"</td>
<td><em>nous&nbsp;<span style="text-decoration: underline;">soyons</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl?"</td>
<td><em>vous&nbsp;<span style="text-decoration: underline;">soyez</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl?"</td>
<td><em>ils&nbsp;<span style="text-decoration: underline;">soient</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgp"</td>
<td><em>j'&nbsp;<span style="text-decoration: underline;">étais</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgp"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">étais</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgp"</td>
<td><em>il&nbsp;<span style="text-decoration: underline;">était</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1ppl"</td>
<td><em>nous&nbsp;<span style="text-decoration: underline;">étions</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2ppl"</td>
<td><em>vous&nbsp;<span style="text-decoration: underline;">étiez</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3ppl"</td>
<td><em>ils&nbsp;<span style="text-decoration: underline;">étaient</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PROGRESSIVE</td>
<td class="inline_code">"ppart"</td>
<td><em>été</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"1sgp+"</td>
<td><em>je&nbsp;<span style="text-decoration: underline;">fus</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"2sgp+"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">fus</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"3sgp+"</td>
<td><em>il&nbsp;<span style="text-decoration: underline;">fut</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"1ppl+"</td>
<td><em>nous&nbsp;<span style="text-decoration: underline;">fûmes</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"2ppl+"</td>
<td><em>vous&nbsp;<span style="text-decoration: underline;">fûtes</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"3ppl+"</td>
<td><em>ils&nbsp;<span style="text-decoration: underline;">furent</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgp?"</td>
<td><em>je&nbsp;<span style="text-decoration: underline;">fusse</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgp?"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">fusses</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgp?"</td>
<td><em>il&nbsp;<span style="text-decoration: underline;">fût</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1ppl?"</td>
<td><em>nous&nbsp;<span style="text-decoration: underline;">fussions</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2ppl?"</td>
<td><em>vous&nbsp;<span style="text-decoration: underline;">fussiez</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3ppl?"</td>
<td><em>ils&nbsp;<span style="text-decoration: underline;">fussent</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgf"</td>
<td><em>je&nbsp;<span style="text-decoration: underline;">serai</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgf"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">seras</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgf"</td>
<td><em>il&nbsp;<span style="text-decoration: underline;">sera</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1plf"</td>
<td><em>nous&nbsp;<span style="text-decoration: underline;">serons</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2plf"</td>
<td><em>vous&nbsp;<span style="text-decoration: underline;">serez</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3plf"</td>
<td><em>ils&nbsp;<span style="text-decoration: underline;">seront</span></em></td>
</tr>
</tbody>
</table>
<p>Instead of optional parameters, a single short alias, or&nbsp;<span class="inline_code">PARTICIPLE</span> or <span class="inline_code">PAST+PARTICIPLE</span> can also be given. With no parameters, the infinitive form of the verb is returned.</p>
<p class="small"><span style="text-decoration: underline;">Reference</span><span>: Salita, B. (2011).&nbsp;<em>French Verb Conjugation Rules</em>. Retrieved from:&nbsp;</span><span><a class="noexternal" style="color: inherit;" href="http://fvcr.sourceforge.net/" target="_blank">http://fvcr.sourceforge.net</a>.</span></p>
<h3>Attributive &amp; predicative adjectives&nbsp;</h3>
<p>French adjectives inflect with an <span class="inline_code">-e</span>,&nbsp;<span class="inline_code">-s</span>&nbsp;or&nbsp;<span class="inline_code">-es</span>&nbsp;suffix depending on gender. There are many irregular cases (e.g., <em>curieux</em>&nbsp;&rarr;&nbsp;<em>une fille curieuse</em>). You can get the base form with the <span class="inline_code">predicative()</span> function.&nbsp;A statistical approach is used with an accuracy of 95%.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.fr import predicative
&gt;&gt;&gt; print predicative('curieuse')
curieux </pre></div>
<h3>Sentiment analysis</h3>
<p class="example">For opinion mining there is <span class="inline_code">sentiment()</span>, which returns a (<span class="inline_code">polarity</span>, <span class="inline_code">subjectivity</span>)-tuple, based on a lexicon of adjectives. Polarity is a value between <span class="inline_code">-1.0</span> and <span class="inline_code">+1.0</span>, subjectivity between <span class="inline_code">0.0</span> and <span class="inline_code">1.0</span>. The accuracy is around 74% (P 0.77, R 0.73) for book reviews:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.fr import sentiment
&gt;&gt;&gt; print sentiment('Un livre magnifique!')
(1.0, 1.0) </pre></div>
<h3>Parser</h3>
<p>For parsing there is <span class="inline_code">parse()</span>, <span class="inline_code">parsetree()</span> and&nbsp;<span class="inline_code">split()</span>. The <span class="inline_code">parse()</span> function annotates words in the given string with their part-of-speech <a class="link-maintenance" href="mbsp-tags.html">tags</a>&nbsp;(e.g., <span class="postag">NN</span> for nouns and <span class="postag">VB</span> for verbs). The <span class="inline_code">parsetree()</span> function takes a string and returns a tree of nested objects (<span class="inline_code">Text</span>&nbsp;&rarr;&nbsp;<span class="inline_code">Sentence</span>&nbsp;&rarr;&nbsp;<span class="inline_code">Chunk</span>&nbsp;&rarr;&nbsp;<span class="inline_code">Word</span>). The <span class="inline_code">split()</span> function takes the output of <span class="inline_code">parse()</span> and returns a <span class="inline_code">Text</span>.&nbsp;See the <span class="inline_code">pattern.en</span> documentation&nbsp;(<span class="link-maintenance" style="color: #78aaff;"><a style="color: #8caaff; outline-style: none !important; outline-width: initial !important; outline-color: initial !important;" href="pattern-en.html#tree">here</a></span>) for how to manipulate <span class="inline_code">Text</span>&nbsp;objects.&nbsp;</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.fr import parse, split
&gt;&gt;&gt;
&gt;&gt;&gt; s = parse(u"Le chat noir s'était assis sur le tapis.")
&gt;&gt;&gt; for sentence in split(s):
&gt;&gt;&gt;     print sentence
Sentence('Le/DT/B-NP/O chat/NN/I-NP/O noir/JJ/I-NP/O'
"s'/PRP/B-NP/O était/VB/B-VP/O assis/VBN/I-VP/O"
'sur/IN/B-PP/B-PNP le/DT/B-NP/I-PNP tapis/NN/I-NP/I-PNP ././O/O')
</pre></div>
<p>The parser is based on <a href="http://alpage.inria.fr/~sagot/lefff-en.html">Le<em>fff</em></a>. For words in Le<em>fff</em> that can have multiple part-of-speech tags, we used <a href="http://www.lexique.org/">Lexique</a> to find the most frequent POS-tag.&nbsp;</p>
<p class="small"><span style="text-decoration: underline;">References</span>:&nbsp;</p>
<p class="small">Sagot, B. (2010).&nbsp;The Le<em>fff</em>, a freely available and large-coverage morphological and syntactic lexicon for French.&nbsp;<em>Proceedings of LREC'10</em>.</p>
<p class="small">New, B., Pallier, C., Ferrand, L. &amp; Matos, R. (2001). A lexical database for contemporary french: LEXIQUE. <em>L'année Psychologique</em>.&nbsp;</p>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,431 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-graph</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-graph" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-graph</a></div>
<h1>pattern.graph</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1392" class="node node-type-page"><div class="node-inner">
<div class="content">
<p class="big"><span style="font-size: 16px;">The pattern.graph module has tools for graph analysis (shortest path, centrality) and graph visualization in the browser. A graph is a network of nodes connected by edges. It can be used for example to study social networks or to model semantic relationships between concepts.</span></p>
<p>It can be used by itself or with other <a href="pattern.html">pattern</a> modules: <a href="pattern-web.html">web</a> | <a href="pattern-db.html">db</a> | <a href="pattern-en.html">en</a> | <a href="pattern-search.html">search</a> | <a href="pattern-vector.html">vector</a> | graph.</p>
<p><img style="border: 0px initial initial;" src="../g/pattern_schema.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<ul>
<li><a href="#node">Node</a></li>
<li><a href="#edge">Edge</a></li>
<li><a href="#graph">Graph</a></li>
<li><a href="#layout">Graph layout</a></li>
<li><a href="#utility">Graph adjacency</a></li>
<li><a href="#canvas">Visualization</a>&nbsp;<span class="link-maintenance">(</span><a class="link-maintenance" href="#canvas"><span class="smallcaps link-maintenance">export</span></a><span class="link-maintenance">)</span></li>
<li><a href="#javascript">graph.js</a></li>
</ul>
<p>&nbsp;</p>
<hr />
<h2><a name="node"></a>Node</h2>
<p>A <span class="inline_code">Node</span> is an element with a unique id (a string or <span class="inline_code">int</span>) in a graph. A graph is a network of nodes and edges (connections between nodes). For example, the World Wide Web (WWW) can be represented as a vast graph with websites as nodes, website URLs as node id's, and hyperlinks as edges. Graph analysis can then be used to find important nodes (i.e., popular websites) and the shortest path between them.</p>
<p>A <span class="inline_code">Node</span> takes a number of optional parameters used to style the <a class="link-maintenance" href="#canvas">visualization</a> of the graph: <span class="inline_code">radius</span> (node size), <span class="inline_code">text</span>, <span class="inline_code">fill</span> and <span class="inline_code">stroke</span> (colors; each a tuple of <a href="http://en.wikipedia.org/wiki/RGBA">RGBA</a> values between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>), <span class="inline_code">strokewidth</span>, <span class="inline_code">font</span>, <span class="inline_code">fontsize</span> and <span class="inline_code">fontweight</span>.</p>
<pre class="brush:python; gutter:false; light:true;">node = Node(id="", **kwargs)</pre><pre class="brush:python; gutter:false; light:true;">node.graph # Parent Graph.
node.id # Unique string or int.
node.links # List of Node objects.
node.edges # List of Edge objects.
node.edge(node, reverse=False)
</pre><pre class="brush:python; gutter:false; light:true;">node.weight # Eigenvector centrality (0.0-1.0).
node.centrality # Betweenness centrality (0.0-1.0).
node.degree # Degree centrality (0.0-1.0). </pre><pre class="brush:python; gutter:false; light:true;">node.x # 2D horizontal offset.
node.y # 2D vertical offset.
node.force # 2D Vector, updated by Graph.layout.
node.radius # Default: 5
node.fill # Default: None
node.stroke # Default: (0,0,0,1)
node.strokewidth # Default: 1
node.text # Text object, or None.</pre><pre class="brush:python; gutter:false; light:true;">node.flatten(depth=1, traversable=lambda node, edge: True)
</pre><ul>
<li><span class="inline_code">Node.edge(node)</span> returns the <span class="inline_code">Edge</span> from this node to the given <span class="inline_code">node</span>, or <span class="inline_code">None</span>.</li>
<li><span class="inline_code">Node.flatten()</span> returns a list with the node itself (<span class="inline_code">depth=0</span>), directly connected nodes (<span class="inline_code">depth=1</span>), nodes connected to those nodes (<span class="inline_code">depth=2</span>), and so on.</li>
</ul>
<p><span class="smallcaps">node weight and centrality</span></p>
<p>A well-known task in graph analysis is measuring how important or <em>central</em> each node in the graph is. The pattern.graph module has three centrality measurements, adopted from <a href="http://networkx.lanl.gov/">NetworkX</a>.</p>
<p><span class="inline_code">Node.weight</span> is the node's <em>eigenvector</em> centrality (= incoming traffic) as a value between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>. Nodes with more (indirect) incoming edges have a higher weight. For example, in the WWW, popular websites are those that are often linked to, where the popularity of the referring websites is taken into account.</p>
<p><span class="inline_code">Node.centrality</span> is the node's <em>betweenness</em> centrality (= passing traffic) as a value between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>. Nodes that occur more frequently in paths between other nodes have a higher betweenness. They are often found at the intersection of different clusters of nodes (e.g., like a broker or a bridge).</p>
<p><span class="inline_code">Node.degree</span> is the node's <em>degree</em> centrality (= local traffic) as a value between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>. Nodes with more edges have a higher degree.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="edge"></a>Edge</h2>
<p>An <span class="inline_code">Edge</span> is a connection between two nodes. Its <span class="inline_code">weight</span> defines the importance of the connection. Edges with a higher weight are preferred when traversing the path between two (indirectly) connected nodes.</p>
<p>An <span class="inline_code">Edge</span> takes optional parameters <span class="inline_code">stroke</span> (a tuple of <a href="http://en.wikipedia.org/wiki/RGBA">RGBA</a> values between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>) and <span class="inline_code">strokewidth</span>, which can be used to style the graph&nbsp;<a class="link-maintenance" href="#canvas">visualization</a>.</p>
<pre class="brush:python; gutter:false; light:true;">edge = Edge(node1, node2, weight=0.0, length=1.0, type=None, **kwargs)</pre><pre class="brush:python; gutter:false; light:true;">edge.node1 # Node (sender).
edge.node2 # Node (receiver).
edge.weight # Connection strength.
edge.length # Length modifier for the visualization.
edge.type # Useful in semantic networks.
edge.stroke # Default: (0,0,0,1)
edge.strokewidth # Default: 1 </pre><p class="smallcaps"><br />directed graph</p>
<p>An edge can be traversed in both directions: from <span class="inline_code">node1</span> &rarr; <span class="inline_code">node2</span>, and from <span class="inline_code">node2</span> &rarr; <span class="inline_code">node1</span>. The <span class="inline_code">Graph.shortest_path()</span> and <span class="inline_code">Graph.betweenness_centrality()</span> methods have a <span class="inline_code">directed</span> parameter which can be set to <span class="inline_code">True</span>, so that edges are only traversed from <span class="inline_code">node1</span> &rarr; <span class="inline_code">node2</span>. This is called a directed graph. Evidently, it produces different shortest paths and node weights.</p>
<p>Two nodes can be connected by at most two edges (one in each direction). Otherwise, <span class="inline_code">Graph.add_edge()</span> simply returns the edge that already exists between the given nodes.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="graph"></a>Graph</h2>
<p>A <span class="inline_code">Graph</span> is a network of nodes connected by edges, with methods for finding paths between (indirectly) connected nodes.</p>
<pre class="brush:python; gutter:false; light:true;">graph = Graph(layout=SPRING, distance=10.0)</pre><pre class="brush:python; gutter:false; light:true;">graph[id] # Node with given id (Graph is a subclass of dict).
graph.nodes # List of Node objects.
graph.edges # List of Edge objects.
graph.density # &lt; 0.35 =&gt; sparse, &gt; 0.65 =&gt; dense
graph.layout # GraphSpringLayout.
graph.distance # GraphSpringLayout spacing.
</pre><pre class="brush:python; gutter:false; light:true;">graph.add_node(id) # Creates + returns new Node.
graph.add_edge(id1, id2) # Creates + returns new Edge.
graph.remove(node) # Removes given Node + edges.
graph.remove(edge) # Removes given Edge.
graph.prune(depth=0) # Removes nodes + edges if len(node.links) &lt;= depth.
graph.node(id) # Returns node with given id.
graph.edge(id1, id2) # Returns edge connecting the given nodes.
graph.copy(nodes=ALL) # Returns a new Graph.
graph.split() # Returns a list of (unconnected) graphs.
</pre><pre class="brush:python; gutter:false; light:true;">graph.eigenvector_centrality() # Updates all Node.weight values.
graph.betweenness_centrality() # Updates all Node.centrality values. </pre><pre class="brush:python; gutter:false; light:true;">graph.shortest_path(node1, node2, heuristic=None, directed=False)
graph.shortest_paths(node, heuristic=None, directed=False)
graph.paths(node1, node2, length=4)
graph.fringe(depth=0, traversable=lambda node, edge: True)
</pre><pre class="brush:python; gutter:false; light:true;">graph.update(iterations=10, weight=10, limit=0.5)</pre><ul>
<li><span class="inline_code"><span><span class="inline_code">Graph.add_node()</span></span></span> takes an id + any optional parameter of <span><span class="inline_code">Node</span></span>.</li>
<li><span class="inline_code">Graph.add_edge()</span> takes two id's + any optional parameter of <span class="inline_code">Edge</span>.<br />Both methods have an optional <span class="inline_code">base</span> parameter that defines the subclass of <span class="inline_code">Node</span> or <span class="inline_code">Edge</span> to use.</li>
</ul>
<ul>
<li><span class="inline_code">Graph.prune()</span> removes all nodes that have <span class="inline_code">depth</span> or fewer (undirected) connections.</li>
<li><span class="inline_code">Graph.copy()</span> returns a new <span class="inline_code">Graph</span> from the given list of nodes.</li>
<li><span class="inline_code">Graph.split()</span> returns a list of unconnected subgraphs.</li>
</ul>
<ul>
<li><span class="inline_code"><span><span class="inline_code">Graph.paths()</span></span></span> returns all paths (each a list of nodes) &lt;= <span class="inline_code">length</span> connecting two given nodes.</li>
<li><span class="inline_code"><span><span class="inline_code">Graph.shortest_path()</span></span></span> returns a list of nodes connecting the two given nodes<span class="inline_code"><span>.</span><br /></span></li>
<li><span class="inline_code">Graph.shortest_paths()</span> returns a dictionary of node <span style="line-height: normal;">&rarr;</span> shortest path.<br />The optional <span class="inline_code">heuristic</span> function takes two node id's and returns a penalty (<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>) for traversing their edges. With <span class="inline_code">directed=True</span>, edges are only traversable in one direction.</li>
</ul>
<ul>
<li><span class="inline_code">Graph.fringe()</span> returns a list of <em>leaf</em> nodes.<br />With <span class="inline_code">depth=0</span>, returns the nodes with one edge.<br />With <span class="inline_code">depth=1</span>, returns the nodes with one edge + the connected nodes, etc.</li>
</ul>
<p>For example:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.graph import Graph
&gt;&gt;&gt;
&gt;&gt;&gt; g = Graph()
&gt;&gt;&gt; for n1, n2 in (
&gt;&gt;&gt; ('cat', 'tail'), ('cat', 'purr'), ('purr', 'sound'),
&gt;&gt;&gt; ('dog', 'tail'), ('dog', 'bark'), ('bark', 'sound')):
&gt;&gt;&gt; g.add_node(n1)
&gt;&gt;&gt; g.add_node(n2)
&gt;&gt;&gt; g.add_edge(n1, n2, weight=0.0, type='is-related-to')
&gt;&gt;&gt;
&gt;&gt;&gt; for n in sorted(g.nodes, key=lambda n: n.weight):
&gt;&gt;&gt; print '%.2f' % n.weight, n
0.00 Node(id='cat')
0.00 Node(id='dog')
0.07 Node(id='purr')
0.07 Node(id='bark')
0.15 Node(id='tail')
1.00 Node(id='sound')
</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; for n in g.shortest_path('purr', 'bark'):
&gt;&gt;&gt; print n
Node(id='purr')
Node(id='sound')
Node(id='bark')
</pre></div>
<table border="0">
<tbody>
<tr>
<td>
<p>When sorted by <span class="inline_code">Node.weight</span> (i.e., eigenvector centrality), <em>sound</em> is the most important node in the network. This can be explained by observing the visualization on the right. Most nodes (indirectly) connect to <em>sound</em> or <em>tail</em>. No nodes connect to <em>dog</em> or <em>cat</em>, so these are the least important in the network (weight <span class="inline_code">0.0</span>).</p>
<p>By default, nodes with a higher weight will have a larger radius in the visualization.</p>
</td>
<td><img src="../g/pattern_graph3.jpg" alt="" width="170" height="155" /></td>
</tr>
</tbody>
</table>
<p>&nbsp;</p>
<hr />
<h2><a name="layout"></a>Graph layout</h2>
<p>A <span class="inline_code">GraphLayout</span> updates node positions (<span class="inline_code">Node.x</span>, <span class="inline_code">Node.y</span>) iteratively each time <span class="inline_code">GraphLayout.update()</span> is called. The pattern.graph module currently has one implementation: <span class="inline_code">GraphSpringLayout</span>, which uses a force-based algorithm where edges are regarded as springs. Connected nodes are pulled closer together (attraction) while other nodes are pushed further apart (repulsion).</p>
<pre class="brush:python; gutter:false; light:true;">layout = GraphSpringLayout(graph)</pre><pre class="brush:python; gutter:false; light:true;">layout.graph # Graph owner.
layout.iterations # Starts at 0, +1 each update().
layout.bounds # (x, y, width, height)-tuple.</pre><pre class="brush:python; gutter:false; light:true;">layout.k # Force constant (4.0)
layout.force # Force multiplier (0.01)
layout.repulsion # Maximum repulsion radius (50)</pre><pre class="brush:python; gutter:false; light:true;">layout.update(weight=10.0, limit=0.5) # weight = Edge.weight multiplier.
layout.reset()
layout.copy(graph)</pre><p><span class="small"><span style="text-decoration: underline;">Reference</span>: Hellesoy, A. &amp; Hoover, D. (2006). http://ajaxian.com/archives/new-javascriptcanvas-graph-library</span></p>
<p>&nbsp;</p>
<hr />
<h2><a name="utility"></a>Graph adjacency</h2>
<p>The pattern.graph module has a number of functions that can be used to modify graph edges:</p>
<pre class="brush:python; gutter:false; light:true;">unlink(graph, node1, node2=None)</pre><pre class="brush:python; gutter:false; light:true;">redirect(graph, node1, node2)</pre><pre class="brush:python; gutter:false; light:true;">cut(graph, node)</pre><pre class="brush:python; gutter:false; light:true;">insert(graph, node, a, b)</pre><ul>
<li style="margin-bottom: 0.3em;"><span class="inline_code">unlink()</span> removes the edge between <span class="inline_code">node1</span> and <span class="inline_code">node2</span>. <br />If only <span class="inline_code">node1</span> is given, removes all edges to + from it. This does not remove <span class="inline_code">node1</span> from the graph.</li>
<li style="margin-bottom: 0.3em;"><span class="inline_code">redirect()</span> connects <span class="inline_code">node1</span>'s edges to <span class="inline_code">node2</span> and removes&nbsp;<span class="inline_code">node1</span>.<br />If <span class="inline_code">A</span>, <span class="inline_code">B</span>, <span class="inline_code">C</span>, <span class="inline_code">D</span> are nodes and <span class="inline_code">A</span> &rarr; <span class="inline_code">B</span> and <span class="inline_code">C</span> &rarr; <span class="inline_code">D</span>, and we redirect <span class="inline_code">A</span> to <span class="inline_code">C</span>, then <span class="inline_code">C</span> &rarr; <span class="inline_code">B</span> and <span class="inline_code">C</span> &rarr; <span class="inline_code">D</span>.</li>
<li style="margin-bottom: 0.3em;"><span class="inline_code">cut()</span> removes the given <span class="inline_code">node</span>&nbsp;and connects the surrounding nodes. <br />If <span class="inline_code">A</span>, <span class="inline_code">B</span>, <span class="inline_code">C</span>, <span class="inline_code">D</span> are nodes and <span class="inline_code">A</span> <span>&rarr;</span> <span class="inline_code">B</span> and <span class="inline_code">B</span> <span>&rarr;</span> <span class="inline_code">C</span> and <span class="inline_code">B</span> <span>&rarr;</span> <span class="inline_code">D</span>, and we cut <span class="inline_code">B</span>, then <span class="inline_code">A</span> <span>&rarr;</span> <span class="inline_code">C</span> and <span class="inline_code">A</span> <span>&rarr;</span> <span class="inline_code">D</span>.</li>
<li><span class="inline_code">insert()</span> inserts the given <span class="inline_code">node</span> between node <span class="inline_code">a</span> and node <span class="inline_code">b</span>. <br />If <span class="inline_code">A</span>, <span class="inline_code">B</span>, <span class="inline_code">C</span> are nodes and <span class="inline_code">A</span> <span>&rarr;</span> <span class="inline_code">B</span>, and we insert <span class="inline_code">C</span>, then <span class="inline_code">A</span> <span>&rarr;</span> <span class="inline_code">C</span> and <span class="inline_code">C</span> <span>&rarr;</span> <span class="inline_code">B</span>.</li>
</ul>
<h3>Edge adjacency map</h3>
<p><span style="font-variant: normal;">The <span class="inline_code">adjacency()</span> function returns a map of linked nodes:</span><span class="smallcaps"><br /></span></p>
<pre class="brush:python; gutter:false; light:true;">adjacency(graph,
directed = False,
reversed = False,
stochastic = False,
heuristic = lambda node1, node2: 0)</pre><p>The return value is an&nbsp;<span class="inline_code">{id1:</span> <span class="inline_code">{id2:</span> <span class="inline_code">weight}}</span>&nbsp;dictionary with <span class="inline_code">Node.id</span>'s as keys, where each value is a dictionary of connected&nbsp;<span class="inline_code">Node.id</span>'s&nbsp;<span style="line-height: 18px;">&rarr;</span>&nbsp;<span class="inline_code">Edge.weight</span>.</p>
<p>If <span class="inline_code">directed=True</span>, edges are only traversable in one direction. If <span class="inline_code">stochastic=True</span>, the edge weights for all neighbors of a given node sum to <span class="inline_code">1.0</span>.&nbsp;The optional <span class="inline_code">heuristic</span> function takes two node id's and returns an additional cost (<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>) for traversing their edges.&nbsp;</p>
<h3>Edge traversal</h3>
<p>The <span class="inline_code">bfs()</span> function (breadth-first search) visits all nodes connected to the given <span class="inline_code">node</span>. <br />The <span class="inline_code">dfs()</span> function (depth-first search) visits all nodes connected to the given <span class="inline_code">node</span> depth-first, i.e., as far as possible along each path before backtracking.</p>
<pre class="brush:python; gutter:false; light:true;">bfs(node, visit=lambda node: False, traversable=lambda node, edge: True)</pre><pre class="brush:python; gutter:false; light:true;">dfs(node, visit=lambda node: False, traversable=lambda node, edge: True)
</pre><p>The given&nbsp;<span class="inline_code">visit</span>&nbsp;function is called with each visited node. Traversal will stop if it returns <span class="inline_code">True</span>, and subsequently <span class="inline_code">bfs()</span> or <span class="inline_code">dfs()</span> will return <span class="inline_code">True</span>.</p>
<p>The given&nbsp;<span class="inline_code">traversable</span> function takes the visited&nbsp;<span class="inline_code">Node</span> and an&nbsp;<span class="inline_code">Edge</span> and returns <span class="inline_code">True</span> if we are allowed to follow this connection to the next node. For example, the traversable for directed edges:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; def directed(node, edge):
&gt;&gt;&gt; return node.id == edge.node1.id
&gt;&gt;&gt;
&gt;&gt;&gt; dfs(g, traversable=directed) </pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="canvas"></a>Visualization</h2>
<p>The pattern.graph module has a JavaScript counterpart (graph.js) that can be used to visualize a graph in a web page, as a&nbsp;HTML&nbsp;&lt;canvas&gt; element. The HTML &lt;canvas&gt; element allows dynamic, scriptable rendering of 2D shapes and bitmap images (see also Pattern's&nbsp;<a class="link-maintenance" href="pattern-canvas.html">canvas.js</a>).</p>
<p><span class="inline_code">Graph.export()</span> creates a new folder at the given <span class="inline_code">path</span>&nbsp;with an index.html (the visualization), a style.css, graph.js and canvas.js. The optional parameter <span class="inline_code">javascript</span>&nbsp;defines the URL path to graph.js and canvas.js (which will not be included in this case).</p>
<pre class="brush:python; gutter:false; light:true;">graph.export(path, encoding='utf-8', **kwargs)</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.graph import Graph
&gt;&gt;&gt;
&gt;&gt;&gt; g = Graph()
&gt;&gt;&gt; for n1, n2 in (
&gt;&gt;&gt; ('cat', 'tail'), ('cat', 'purr'), ('purr', 'sound'),
&gt;&gt;&gt; ('dog', 'tail'), ('dog', 'bark'), ('bark', 'sound')):
&gt;&gt;&gt; g.add_node(n1)
&gt;&gt;&gt; g.add_node(n2)
&gt;&gt;&gt; g.add_edge(n1, n2, weight=0.0, type='is-related-to')
&gt;&gt;&gt;
&gt;&gt;&gt; g.export('sound', directed=True)</pre></div>
<p>Nodes and edges will be styled according to their <span class="inline_code">fill</span>, <span class="inline_code">stroke</span>, and <span class="inline_code">strokewidth</span>&nbsp;properties.</p>
<p>The following parameters can be used to customize the visualization:</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Parameter</span></td>
<td><span class="smallcaps">Default</span></td>
<td><span class="smallcaps">Description</span></td>
</tr>
<tr>
<td><span class="inline_code">javascript</span></td>
<td><span class="inline_code">''</span></td>
<td>Path to canvas.js&nbsp;and graph.js.</td>
</tr>
<tr>
<td><span class="inline_code">stylesheet</span></td>
<td class="inline_code"><span class="inline_code">INLINE</span></td>
<td>Path to CSS: INLINE,&nbsp;<span class="inline_code">DEFAULT</span>&nbsp;(generates style.css),&nbsp;<span class="inline_code">None</span>&nbsp;or path.</td>
</tr>
<tr>
<td><span class="inline_code">title</span></td>
<td><span class="inline_code">'Graph'</span></td>
<td>HTML&nbsp;<span class="inline_code"><span><span class="inline_code">&lt;title&gt;Graph&lt;/title&gt;</span>.</span></span></td>
</tr>
<tr>
<td><span class="inline_code">id</span></td>
<td><span class="inline_code">'graph'</span></td>
<td>HTML&nbsp;<span class="inline_code">&lt;div</span> <span class="inline_code">id="graph"&gt;</span>&nbsp;contains the&nbsp;<span class="inline_code">&lt;canvas&gt;</span>.</td>
</tr>
<tr>
<td style="border: 0; font-size: 0.5em;">&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">ctx</span></td>
<td><span class="inline_code">'canvas.element'</span></td>
<td>HTML <span class="inline_code">&lt;canvas&gt;</span> element to use for drawing.</td>
</tr>
<tr>
<td><span class="inline_code">width</span></td>
<td><span class="inline_code">700</span></td>
<td>Canvas width in pixels.</td>
</tr>
<tr>
<td><span class="inline_code">height</span></td>
<td><span class="inline_code">500</span></td>
<td>Canvas height in pixels.</td>
</tr>
<tr>
<td><span class="inline_code">frames</span></td>
<td><span class="inline_code">500</span></td>
<td>Number of frames of animation.</td>
</tr>
<tr>
<td><span class="inline_code">ipf</span></td>
<td><span class="inline_code">2</span></td>
<td><span class="inline_code">GraphLayout.update()</span> iterations per frame.</td>
</tr>
<tr>
<td style="border: 0; font-size: 0.5em;">&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">directed</span></td>
<td><span class="inline_code">False</span></td>
<td>Visualize eigenvector centrality as an edge arrow?</td>
</tr>
<tr>
<td><span class="inline_code">weighted</span></td>
<td><span class="inline_code">False</span></td>
<td>Visualize betweenness centrality as a node shadow?</td>
</tr>
<tr>
<td><span class="inline_code">pack</span></td>
<td><span class="inline_code">True</span></td>
<td>Shorten leaf edges + add node weight to node radius.</td>
</tr>
<tr>
<td style="border: 0; font-size: 0.5em;">&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">distance</span></td>
<td><span class="inline_code">graph.distance</span></td>
<td>Average edge length.</td>
</tr>
<tr>
<td><span class="inline_code">k</span></td>
<td><span class="inline_code">graph.k</span></td>
<td>Force constant.</td>
</tr>
<tr>
<td><span class="inline_code">force</span></td>
<td><span class="inline_code">graph.force</span></td>
<td>Force dampener.</td>
</tr>
<tr>
<td><span class="inline_code">repulsion</span></td>
<td><span class="inline_code">graph.repulsion</span></td>
<td>Force radius.</td>
</tr>
<tr>
<td style="border: 0; font-size: 0.5em;">&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">href</span></td>
<td><span class="inline_code">{}</span></td>
<td>Dictionary of <span class="inline_code">Node.id</span> =&gt; URL.</td>
</tr>
<tr>
<td><span class="inline_code">css</span></td>
<td><span class="inline_code">{}</span></td>
<td>Dictionary of <span class="inline_code">Node.id</span> =&gt; CSS classname.</td>
</tr>
</tbody>
</table>
<p>To export a static visualization, use <span class="inline_code">frames=1</span> and <span class="inline_code">ipf=0</span>.<br />&nbsp;</p>
<p class="smallcaps">Server-side scripting</p>
<p><span class="inline_code">Graph.serialize()</span> returns a string with (a portion of) the HTML, CSS and JavaScript source code of the visualization. It can be used to serve a dynamic web page.&nbsp;With <span class="inline_code">type=CANVAS</span>, it returns a HTML string with a <span class="inline_code">&lt;div</span> <span class="inline_code">id="graph"&gt;</span>&nbsp;that contains the canvas.js animation.&nbsp;With <span class="inline_code">type=DATA</span>, it returns a Javascript string that initializes the <span class="inline_code">Graph</span> in variable&nbsp;<span class="inline_code">g</span>&nbsp;(which will draw to <span class="inline_code">ctx</span>).</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">graph.serialize(type=HTML, **kwargs) # HTML | CSS | CANVAS | DATA</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; import cherrypy
&gt;&gt;&gt;
&gt;&gt;&gt; class Visualization(object):
&gt;&gt;&gt; def index(self):
&gt;&gt;&gt; return (
&gt;&gt;&gt; '&lt;html&gt;'
&gt;&gt;&gt; '&lt;head&gt;'
&gt;&gt;&gt; '&lt;script src="canvas.js"&gt;&lt;/script&gt;'
&gt;&gt;&gt; '&lt;script src="graph.js"&gt;&lt;/script&gt;'
&gt;&gt;&gt; '&lt;/head&gt;'
&gt;&gt;&gt; '&lt;body&gt;' + g.serialize(CANVAS, directed=True) +
&gt;&gt;&gt; '&lt;/body&gt;'
&gt;&gt;&gt; '&lt;/html&gt;'
&gt;&gt;&gt; )
&gt;&gt;&gt; index.exposed = True
&gt;&gt;&gt;
&gt;&gt;&gt; cherrypy.quickstart(Visualization())</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="javascript"></a>graph.js</h2>
<p>Below is a standalone demonstration of graph.js, without using&nbsp;<span class="inline_code">export()</span> or canvas.js. The <span class="inline_code">Graph.loop()</span> method fires the spring layout algorithm&nbsp;(<span class="link-maintenance"><a href="http://www.clips.ua.ac.be/media/pattern-graph/random" target="_blank">view live demo</a></span>).</p>
<p><img class="border" src="../g/pattern_graph4.jpg" alt="" width="610" height="390" /></p>
<div class="example">
<pre class="brush:xml; gutter:false; light:true;">&lt;!doctype html&gt;
&lt;html&gt;
&lt;head&gt;
&lt;meta charset="utf-8"&gt;
&lt;style&gt;
#graph { display: block; position: relative; overflow: hidden; }
#graph .node-label { font: 11px sans-serif; }
&lt;/style&gt;
&lt;script src="graph.js"&gt;&lt;/script&gt;
&lt;script&gt;
</pre></div>
<div class="example">
<pre class="brush: jscript;gutter: false; light: true; fontsize: 100; first-line: 1; ">&nbsp;&nbsp;&nbsp;&nbsp;function spring() {
SHADOW = 0.65 // slow...
g = new Graph(document.getElementById("_ctx"));
// Random nodes.
for (var i=0; i &lt; 50; i++) {
g.addNode(i+1);
}
// Random edges.
for (var j=0; j &lt; 75; j++) {
var n1 = choice(g.nodes);
var n2 = choice(g.nodes);
g.addEdge(n1, n2, {weight: Math.random()});
}
g.prune(0);
g.betweennessCentrality();
g.eigenvectorCentrality();
g.loop({frames:500, fps:30, ipf:2, weighted:0.5, directed:true});
}
</pre></div>
<div class="example">
<pre class="brush:xml; gutter:false; light:true;"> &lt;/script&gt;
&lt;/head&gt;
&lt;body onload="spring();"&gt;
&lt;div id="graph" style="width:700px; height:500px;"&gt;
&lt;canvas id="_ctx" width="700" height="500"&gt;&lt;/canvas&gt;
&lt;/div&gt;
&lt;/body&gt;
&lt;/html&gt; </pre></div>
<p>&nbsp;</p>
<hr />
<h2>See also</h2>
<ul>
<li><a href="http://gephi.org/" target="_blank">Gephi</a> (GPL): ne<span>twork analysis &amp; visualization GUI.</span></li>
<li><a href="http://networkx.lanl.gov/" target="_blank">NetworkX</a> (BSD): <span>network analysis toolkit for Python + NumPy.</span></li>
<li><a href="http://www.cityinabottle.org/nodebox/" target="_blank">NodeBox</a> (BSD): g<span>raphics toolkit for Python + OpenGL.</span></li>
</ul>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,613 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-it</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-it" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-it</a></div>
<h1>pattern.it</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1698" class="node node-type-page"><div class="node-inner">
<div class="content">
<p><span class="big">The pattern.it module contains a fast part-of-speech tagger for Italian (identifies nouns, adjectives, verbs, etc. in a sentence) and tools for Italian verb conjugation and noun singularization &amp; pluralization.</span></p>
<p>It can be used by itself or with other&nbsp;<a href="pattern.html">pattern</a>&nbsp;modules:&nbsp;<a href="pattern-web.html">web</a>&nbsp;|&nbsp;<a href="pattern-db.html">db</a>&nbsp;| <a href="pattern-en.html">en</a>&nbsp;|&nbsp;<a href="pattern-search.html">search</a>&nbsp;|&nbsp;<a href="pattern-vector.html">vector</a>&nbsp;|&nbsp;<a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema_it.gif" alt="" /></p>
<hr />
<h2>Documentation</h2>
<p>The functions in this module take the same parameters and return the same values as their counterparts in <a href="pattern-en.html">pattern.en</a>. Refer to the documentation there for more details.&nbsp;&nbsp;</p>
<h3>Gender</h3>
<p>Italian nouns and adjectives inflect according to gender. The <span class="inline_code">gender()</span> function predicts the gender (<span class="inline_code">MALE</span>, <span class="inline_code">FEMALE</span>,&nbsp;<span class="inline_code">PLURAL</span>) of&nbsp;a given noun with about 92% accuracy:&nbsp;</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.it import gender, MALE, FEMALE, PLURAL
&gt;&gt;&gt; print gender('gatti')
(MALE, PLURAL)</pre></div>
<h3>Article</h3>
<p>The <span class="inline_code">article()</span> function returns the article (<span class="inline_code">INDEFINITE</span> or <span class="inline_code">DEFINITE</span>) inflected by gender (e.g., <em><span style="text-decoration: underline;">il</span> gatto</em>&nbsp;&rarr; <em><span style="text-decoration: underline;">i</span> gatti</em>).</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.it import article, DEFINITE, MALE, PLURAL
&gt;&gt;&gt; print article('gatti', DEFINITE, gender=(MALE, PLURAL))
i</pre></div>
<h3>Noun singularization &amp; pluralization</h3>
<p>For Italian nouns there is <span class="inline_code">singularize()</span> and <span class="inline_code">pluralize()</span>.&nbsp;The implementation is slightly less robust than the English version (accuracy 84% for singularization and 93% for pluralization).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.it import singularize, pluralize
&gt;&gt;&gt;
&gt;&gt;&gt; print singularize('gatti')
&gt;&gt;&gt; print pluralize('gatto')
gatto
gatti </pre></div>
<h3>Verb conjugation</h3>
<p>For Italian verbs there is <span class="inline_code">conjugate()</span>, <span class="inline_code">lemma()</span>, <span class="inline_code">lexeme()</span> and <span class="inline_code">tenses()</span>.&nbsp;The lexicon for verb conjugation contains about 1,250 common Italian verbs, mined from Wiktionary. For unknown verbs it will fall back to a rule-based approach with an accuracy of about 86%.&nbsp;</p>
<p>Italian verbs have more tenses than English verbs. In particular, the plural differs for each person, and there are additional forms for the&nbsp;<span class="inline_code">FUTURE</span>&nbsp;tense, the&nbsp;<span class="inline_code">IMPERATIVE</span>, <span class="inline_code">CONDITIONAL</span> and&nbsp;<span class="inline_code">SUBJUNCTIVE</span>&nbsp;mood and the&nbsp;<span class="inline_code">PERFECTIVE</span>&nbsp;aspect:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.it import conjugate
&gt;&gt;&gt; from pattern.it import INFINITIVE, PRESENT, PAST, SG, SUBJUNCTIVE, PERFECTIVE
&gt;&gt;&gt;
&gt;&gt;&gt; print conjugate('sono', INFINITIVE)
&gt;&gt;&gt; print conjugate('sono', PRESENT, 1, SG, mood=SUBJUNCTIVE)
&gt;&gt;&gt; print conjugate('sono', PAST, 3, SG)
&gt;&gt;&gt; print conjugate('sono', PAST, 3, SG, aspect=PERFECTIVE)
essere
sia
era
fu </pre></div>
<p>For <span class="inline_code">PAST</span>&nbsp;tense + <span class="inline_code">PERFECTIVE</span>&nbsp;aspect we can also use <span class="inline_code">PRETERITE</span>&nbsp;(<em>passato remoto</em>). For <span class="inline_code">PAST</span>&nbsp;tense + <span class="inline_code">IMPERFECTIVE</span>&nbsp;aspect we can also use <span class="inline_code">IMPERFECT</span>&nbsp;(<em>imperfetto</em>).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.it import conjugate
&gt;&gt;&gt; from pattern.it import IMPERFECT, PRETERITE
&gt;&gt;&gt;
&gt;&gt;&gt; print conjugate('sono', IMPERFECT, 3, SG)
&gt;&gt;&gt; print conjugate('sono', PRETERITE, 3, SG)
era
fu </pre></div>
<p>&nbsp;The <span class="inline_code">conjugate()</span> function takes the following optional parameters:</p>
<table class="border">
<tbody>
<tr>
<td class="smallcaps">Tense</td>
<td class="smallcaps">Person</td>
<td class="smallcaps">Number</td>
<td class="smallcaps">Mood</td>
<td class="smallcaps">Aspect</td>
<td class="smallcaps">Alias</td>
<td class="smallcaps">Example</td>
</tr>
<tr>
<td class="inline_code">INFINITIVE</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">"inf"</td>
<td><em>essere</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg"</td>
<td><em>io&nbsp;<span style="text-decoration: underline;">sono</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">sei</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg"</td>
<td><em>lui&nbsp;<span style="text-decoration: underline;">è</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl"</td>
<td><em>noi&nbsp;<span style="text-decoration: underline;">siamo</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl"</td>
<td><em>voi&nbsp;<span style="text-decoration: underline;">siete</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl"</td>
<td><em>loro&nbsp;<span style="text-decoration: underline;">sono</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PROGRESSIVE</td>
<td class="inline_code">"part"</td>
<td><em>essendo</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg!"</td>
<td><em>sii</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg!"</td>
<td><em>sia</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl!"</td>
<td><em>siamo</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl!"</td>
<td><em>siate</em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">IMPERATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl!"</td>
<td><em>siano</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg?"</td>
<td><em>io&nbsp;<span style="text-decoration: underline;">sia</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg?"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">sia</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg?"</td>
<td><em>lui&nbsp;<span style="text-decoration: underline;">sia</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl?"</td>
<td><em>noi&nbsp;<span style="text-decoration: underline;">siamo</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl?"</td>
<td><em>voi&nbsp;<span style="text-decoration: underline;">siate</span></em></td>
</tr>
<tr>
<td class="inline_code">PRESENT</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl?"</td>
<td><em>loro&nbsp;<span style="text-decoration: underline;">siano</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgp"</td>
<td><em>io&nbsp;<span style="text-decoration: underline;">ero</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgp"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">eri</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgp"</td>
<td><em>lui&nbsp;<span style="text-decoration: underline;">era</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1ppl"</td>
<td><em>noi <span style="text-decoration: underline;">e</span><span style="text-decoration: underline;">ravamo</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2ppl"</td>
<td><em>voi&nbsp;<span style="text-decoration: underline;">eravate</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3ppl"</td>
<td><em>loro&nbsp;<span style="text-decoration: underline;">erano</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">None</td>
<td class="inline_code">None</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PROGRESSIVE</td>
<td class="inline_code">"ppart"</td>
<td><em>stato</em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"1sgp+"</td>
<td><em>io&nbsp;<span style="text-decoration: underline;">fui</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"2sgp+"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">fosti</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"3sgp+"</td>
<td><em>lui&nbsp;<span style="text-decoration: underline;">fu</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"1ppl+"</td>
<td><em>noi&nbsp;<span style="text-decoration: underline;">fummo</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"2ppl+"</td>
<td><em>voi&nbsp;<span style="text-decoration: underline;">foste</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">PERFECTIVE</td>
<td class="inline_code">"3ppl+"</td>
<td><em>loro&nbsp;<span style="text-decoration: underline;">furono</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgp?"</td>
<td><em>io&nbsp;<span style="text-decoration: underline;">fossi</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgp?"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">fossi</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgp?"</td>
<td><em>lui&nbsp;<span style="text-decoration: underline;">fosse</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1ppl?"</td>
<td><em>noi&nbsp;<span style="text-decoration: underline;">fossimo</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2ppl?"</td>
<td><em>voi&nbsp;<span style="text-decoration: underline;">foste</span></em></td>
</tr>
<tr>
<td class="inline_code">PAST</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">SUBJUNCTIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3ppl?"</td>
<td><em>loro&nbsp;<span style="text-decoration: underline;">fossero</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sgf"</td>
<td><em>io&nbsp;<span style="text-decoration: underline;">sarò</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sgf"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">sarai</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sgf"</td>
<td><em>lui&nbsp;<span style="text-decoration: underline;">sarà</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1plf"</td>
<td><em>noi&nbsp;<span style="text-decoration: underline;">saremo</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2plf"</td>
<td><em>voi&nbsp;<span style="text-decoration: underline;">sarete</span></em></td>
</tr>
<tr>
<td class="inline_code">FUTURE</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3plf"</td>
<td><em>loro&nbsp;<span style="text-decoration: underline;">saranno</span></em></td>
</tr>
<tr>
<td style="border-left: 0; border-right: 0; padding: 0;">&nbsp;</td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">1</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1sg-&gt;"</td>
<td><em>io&nbsp;<span style="text-decoration: underline;">sarei</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">2</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2sg-&gt;"</td>
<td><em>tu&nbsp;<span style="text-decoration: underline;">saresti</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">3</td>
<td class="inline_code">SG</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3sg-&gt;"</td>
<td><em>lui&nbsp;<span style="text-decoration: underline;">sarebbe</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">1</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"1pl-&gt;"</td>
<td><em>noi&nbsp;<span style="text-decoration: underline;">saremmo</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">2</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"2pl-&gt;"</td>
<td><em>voi&nbsp;<span style="text-decoration: underline;">sareste</span></em></td>
</tr>
<tr>
<td class="inline_code">CONDITIONAL</td>
<td class="inline_code">3</td>
<td class="inline_code">PL</td>
<td class="inline_code">INDICATIVE</td>
<td class="inline_code">IMPERFECTIVE</td>
<td class="inline_code">"3pl-&gt;"</td>
<td><em>loro&nbsp;<span style="text-decoration: underline;">sarebbero</span></em></td>
</tr>
</tbody>
</table>
<p>Instead of optional parameters, a single short alias, or&nbsp;<span class="inline_code">PARTICIPLE</span> or <span class="inline_code">PAST+PARTICIPLE</span> can also be given. With no parameters, the infinitive form of the verb is returned.</p>
<h3>Attributive &amp; predicative adjectives&nbsp;</h3>
<p>Italian adjectives inflect with suffixes&nbsp;<span class="inline_code">-o</span>&nbsp;&rarr; <span class="inline_code">-i</span>&nbsp;(masculine) and&nbsp;<span class="inline_code">-a</span>&nbsp;&rarr; <span class="inline_code">-e</span>&nbsp;(feminine), with some exceptions&nbsp;(e.g., <em>grande</em>&nbsp;&rarr; <em>i grandi felini</em>). You can get the base form with the <span class="inline_code">predicative()</span> function. A statistical approach is used with an accuracy of 88%.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.it import predicative
&gt;&gt;&gt; print predicative('grandi')
grande </pre></div>
<h3>Parser</h3>
<p>For parsing there is <span class="inline_code" style="font-family: Courier, monospace; font-size: 12px;">parse()</span>, <span class="inline_code">parsetree()</span> and&nbsp;<span class="inline_code" style="font-family: Courier, monospace; font-size: 12px;">split()</span>. The <span class="inline_code">parse()</span> function annotates words in the given string with their part-of-speech <a class="link-maintenance" href="mbsp-tags.html">tags</a>&nbsp;(e.g., <span class="postag">NN</span> for nouns and <span class="postag">VB</span> for verbs). The <span class="inline_code">parsetree()</span> function takes a string and returns a tree of nested objects (<span class="inline_code">Text</span>&nbsp;<span class="inline_code">Sentence</span>&nbsp;<span class="inline_code">Chunk</span>&nbsp;<span class="inline_code">Word</span>). The <span class="inline_code">split()</span> function takes the output of <span class="inline_code">parse()</span> and returns a <span class="inline_code">Text</span>.&nbsp;See the <span class="inline_code">pattern.en</span> documentation&nbsp;(<span class="link-maintenance" style="color: #78aaff;"><a style="color: #8caaff; outline-style: none !important; outline-width: initial !important; outline-color: initial !important;" href="pattern-en.html#tree">here</a></span>) how to manipulate <span class="inline_code">Text</span>&nbsp;objects.&nbsp;</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.it import parse, split
&gt;&gt;&gt;
&gt;&gt;&gt; s = parse('Il gatto nero faceva le fusa.')
&gt;&gt;&gt; for sentence in split(s):
&gt;&gt;&gt;     print sentence
Sentence('Il/DT/B-NP/O gatto/NN/I-NP/O nero/JJ/I-NP/O'
'faceva/VB/B-VP/O'
'le/DT/B-NP/O fusa/NN/I-NP/O ././O/O')
</pre></div>
<p>The parser is mined from Wiktionary.&nbsp;The accuracy is around 92%.</p>
<h3>Sentiment analysis</h3>
<p>There's no&nbsp;<span class="inline_code">sentiment()</span> function for Italian yet.</p>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,531 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-metrics</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-metrics" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-metrics</a></div>
<h1>pattern.metrics</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1405" class="node node-type-page"><div class="node-inner">
<div class="content">
<p style="text-align: left;"><span class="big">The pattern.metrics module is a loose collection of performance, accuracy, similarity and significance tests, including code profiling, precision &amp; recall, inter-rater agreement, text metrics (similarity, readability, intertextuality, cooccurrence) and statistics (variance, chi-squared, goodness of fit).</span></p>
<p>It can be used by itself or with other <a href="pattern.html">pattern</a> modules: <a href="pattern-web.html">web</a> | <a href="pattern-db.html">db</a> | <a href="pattern-en.html">en</a> | <a href="pattern-search.html">search</a> <span class="blue"> </span>| <a href="pattern-vector.html">vector</a> | <a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<ul>
<li><a href="#profile">Profiler</a></li>
<li><a href="#accuracy">Accuracy, precision and recall</a></li>
<li><a href="#agreement">Inter-rater agreement</a> <span class="small link-maintenance">(Fleiss)</span></li>
</ul>
<div class="smallcaps">Text metrics</div>
<ul>
<li><a href="#similarity">Similarity</a> <span class="small link-maintenance">(Levenshtein, Dice)</span></li>
<li><a href="#readability">Readability</a> <span class="small link-maintenance">(Flesch)</span></li>
<li><a href="#ttr">Type-token ratio</a></li>
<li><a href="#intertextuality">Intertextuality</a></li>
<li><a href="#cooccurrence">Cooccurrence</a></li>
</ul>
<div class="smallcaps">Statistics</div>
<ul>
<li><a href="#mean">Mean, variance, standard deviation</a></li>
<li><a href="#gauss">Normal distribution</a></li>
<li><a href="#histogram">Histogram</a></li>
<li><a href="#moment">Moment</a></li>
<li><a href="#quantile">Quantile &amp; box plot</a></li>
</ul>
<ul>
<li><a href="#fisher">Fisher's exact test</a></li>
<li><a href="#chi2">Pearson's chi-squared test</a></li>
<li><a href="#ks2">Kolmogorov-Smirnov test</a></li>
</ul>
<p>&nbsp;</p>
<hr />
<h2><a name="profile"></a>Profiler</h2>
<p>Python is optimized with fast C extensions (e.g., <span class="inline_code">dict</span> traversal, regular expressions). Pattern is optimized with caching mechanisms. The <span class="inline_code">profile()</span> function can be used to test the performance (speed) of your own code. It returns a string with a breakdown of function calls + running time. You can then test the <span class="inline_code">duration()</span> of individual functions and refactor their source code to make them faster.</p>
<pre class="brush:python; gutter:false; light:true;">profile(function, *args, **kwargs) # Returns a string (report).</pre><pre class="brush:python; gutter:false; light:true;">duration(function, *args, **kwargs) # Returns a float (seconds).</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.metrics import profile
&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; def main(n=10):
&gt;&gt;&gt;     for i in range(n):
&gt;&gt;&gt;         parsetree('The cat sat on the mat.')
&gt;&gt;&gt;
&gt;&gt;&gt; print profile(main, n=100)</pre></div>
<table class="border">
<tbody>
<tr>
<td class="smallcaps" style="text-align: center;">ncalls</td>
<td class="smallcaps" style="text-align: center;">tottime</td>
<td class="smallcaps" style="text-align: center;">percall</td>
<td class="smallcaps" style="text-align: center;">cumtime</td>
<td class="smallcaps" style="text-align: center;">percall</td>
<td class="smallcaps">filename:lineno(function)</td>
</tr>
<tr>
<td style="text-align: center;">1</td>
<td style="text-align: center;">0.082</td>
<td style="text-align: center;">0.082</td>
<td style="text-align: center;">1.171</td>
<td style="text-align: center;">1.171</td>
<td>text/__init__.py:229(load)</td>
</tr>
<tr>
<td style="text-align: center;">94,127</td>
<td style="text-align: center;">0.147</td>
<td style="text-align: center;">0.000</td>
<td style="text-align: center;">1.089</td>
<td style="text-align: center;">0.000</td>
<td>text/__init__.py:231(&lt;genexpr&gt;)</td>
</tr>
<tr>
<td style="text-align: center;">94,774</td>
<td style="text-align: center;">0.233</td>
<td style="text-align: center;">0.000</td>
<td style="text-align: center;">0.861</td>
<td style="text-align: center;">0.000</td>
<td>text/__init__.py:195(_read)</td>
</tr>
<tr>
<td style="text-align: center;">95,391</td>
<td style="text-align: center;">0.321</td>
<td style="text-align: center;">0.000</td>
<td style="text-align: center;">0.541</td>
<td style="text-align: center;">0.000</td>
<td>text/__init__.py:33(decode_string)</td>
</tr>
<tr>
<td style="text-align: center;">95,991</td>
<td style="text-align: center;">0.073</td>
<td style="text-align: center;">0.000</td>
<td style="text-align: center;">0.182</td>
<td style="text-align: center;">0.000</td>
<td>{_codecs.utf_8_decode}</td>
</tr>
</tbody>
</table>
<p>In this example, the pattern.en parser spends most of its time loading data files and decoding Unicode.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="accuracy"></a>Accuracy, precision and recall</h2>
<p>Precision and recall can be used to test the performance (accuracy) of a binary classifier. A well-known classification task is spam detection, for example an <span class="inline_code">is_spam()</span> function that yields <span class="inline_code">True</span> or <span class="inline_code">False</span> (binary). Accuracy is a measure of how many times the function gives the correct answer: <span class="inline_code">True</span> for spam messages (= true positives, "hits") and <span class="inline_code">False</span> for messages that are not spam (= true negatives). Occasionally, the function might also return <span class="inline_code">True</span> for messages that are not spam (= false positives, "errors"), or <span class="inline_code">False</span> for messages that <em>are</em> spam (= false negatives, "misses").</p>
<p><strong>Precision</strong> is a measure of hits vs. errors. <strong>Recall</strong> is a measure of hits vs. misses. High precision means that actual e-mail does not end up in the junk folder. High recall means that no spam ends up in the inbox.</p>
<p>The <span class="inline_code">confusion_matrix()</span> function takes a function that returns <span class="inline_code">True</span> or <span class="inline_code">False</span> for a given document (e.g., a string), and a list of <span class="inline_code">(document,</span> <span class="inline_code">bool)</span>-tuples for testing. It returns a <span class="inline_code">(TP,</span> <span class="inline_code">TN,</span> <span class="inline_code">FP,</span> <span class="inline_code">FN)</span>-tuple.</p>
<p>The <span class="inline_code">test()</span> function takes a function and a list of <span class="inline_code">(document,</span> <span class="inline_code">bool)</span>-tuples. It returns a tuple with <span class="inline_code">(accuracy,</span> <span class="inline_code">precision,</span> <span class="inline_code">recall,</span> <span class="inline_code">F1-score)</span>. The optional <span class="inline_code">average</span> can be <span class="inline_code">MACRO</span> or <span class="inline_code">None</span>.</p>
<pre class="brush:python; gutter:false; light:true;">confusion_matrix(match=lambda document: False, documents=[(None, False)])</pre><pre class="brush:python; gutter:false; light:true;">test(match=lambda document:False, documents=[], average=None)</pre><table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Metric</span></td>
<td><span class="smallcaps">Formula</span></td>
<td><span class="smallcaps">Description</span></td>
</tr>
<tr>
<td>Accuracy</td>
<td><span class="inline_code">(TP</span> <span class="inline_code">+</span> <span class="inline_code">TN)</span> <span class="inline_code">/</span> <span class="inline_code">(TP</span> <span class="inline_code">+</span> <span class="inline_code">TN</span> <span class="inline_code">+</span> <span class="inline_code">FP</span> <span class="inline_code">+</span> <span class="inline_code">FN)</span></td>
<td>percentage of correct classifications</td>
</tr>
<tr>
<td>Precision</td>
<td><span class="inline_code">TP</span> <span class="inline_code">/</span> <span class="inline_code">(TP</span> <span class="inline_code">+</span> <span class="inline_code">FP)</span></td>
<td>percentage of correct positive classifications</td>
</tr>
<tr>
<td>Recall</td>
<td><span class="inline_code">TP</span> <span class="inline_code">/</span> <span class="inline_code">(TP</span> <span class="inline_code">+</span> <span class="inline_code">FN)</span></td>
<td>percentage of positive cases correctly classified as positive</td>
</tr>
<tr>
<td>F1-score</td>
<td><span class="inline_code">2</span> <span class="inline_code">x</span> <span class="inline_code">P</span> <span class="inline_code">x</span> <span class="inline_code">R</span> <span class="inline_code">/</span> <span class="inline_code">(P</span> <span class="inline_code">+</span> <span class="inline_code">R)</span></td>
<td>harmonic mean of precision and recall</td>
</tr>
</tbody>
</table>
<p>For example:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.metrics import confusion_matrix, test
&gt;&gt;&gt;
&gt;&gt;&gt; def is_spam(s):
&gt;&gt;&gt; s = (w.strip(',.?!"') for w in s.lower().split())
&gt;&gt;&gt; return any(w in ('viagra', 'lottery') for w in s)
&gt;&gt;&gt;
&gt;&gt;&gt; data = [
&gt;&gt;&gt; ('In attachment is the final report.', False),
&gt;&gt;&gt; ('Here is that link we talked about.', False),
&gt;&gt;&gt; ("Don't forget to buy more cat food!", False),
&gt;&gt;&gt; ("Shouldn't is_spam() flag 'viagra'?", False),
&gt;&gt;&gt; ('You are the winner in our lottery!', True),
&gt;&gt;&gt; ('VIAGRA PROFESSIONAL as low as 1.4$', True)
&gt;&gt;&gt; ]
&gt;&gt;&gt; print confusion_matrix(is_spam, data)
&gt;&gt;&gt; print test(is_spam, data)
(2, 3, 1, 0)
(0.83, 0.67, 1.00, 0.80) </pre></div>
<p>In this example, <span class="inline_code">is_spam()</span> correctly classifies 5 out of 6 messages (83% accuracy). It identifies all spam messages (100% recall). However, it also flags a message that is not spam (67% precision).</p>
<p>&nbsp;</p>
<hr />
<h2><a name="agreement"></a>Inter-rater agreement</h2>
<p>Inter-rater agreement (Fleiss' kappa) can be used to test the consensus among different raters. For example, say we have an <span class="inline_code">is_spam()</span> function that predicts whether a given e-mail message is spam or not. It uses a list of words, each annotated with a "junk score" between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>. To avoid bias, each score is the average of the ratings of three different annotators. The annotators agree on obvious words such as <em>viagra</em> (everyone says <span class="inline_code">1.0</span>), but their ratings diverge on ambiguous words. So how <em>reliable</em> is the list?</p>
<p>The <span class="inline_code">agreement()</span> function returns the reliability as a number between <span class="inline_code">-1.0</span> and <span class="inline_code">+1.0</span> (where <span class="inline_code">+0.7</span> is reliable). The given <span class="inline_code">matrix</span> is a list in which each row represents a task. Each task is a list with the number of votes per rating. Each column represents a possible rating.</p>
<pre class="brush:python; gutter:false; light:true;">agreement(matrix)</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.metrics import agreement
&gt;&gt;&gt;
&gt;&gt;&gt; m = [ # 0.0 0.5 1.0 JUNK?
&gt;&gt;&gt; [ 0, 0, 3 ], # viagra
&gt;&gt;&gt; [ 0, 1, 2 ], # lottery
&gt;&gt;&gt; [ 1, 2, 0 ], # buy
&gt;&gt;&gt; [ 3, 0, 0 ], # cat
&gt;&gt;&gt; ]
&gt;&gt;&gt; print agreement(m)
0.49</pre></div>
<p>Although the annotators disagree on ambiguous words such as <em>buy</em> (one says <span class="inline_code">0.0</span>, the others say <span class="inline_code">0.5</span>), the list is quite reliable (<span class="inline_code">+0.49</span> agreement). The averaged score for <em>buy</em> will be <span class="inline_code">0.33</span>.</p>
<p>&nbsp;</p>
<hr />
<h2>Text metrics</h2>
<h3><a name="similarity"></a>Similarity</h3>
<p>The <span class="inline_code">similarity()</span> function can be used to test the similarity between two strings. It returns a value between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>. The optional <span class="inline_code">metric</span> can be <span class="inline_code">LEVENSHTEIN</span> or <span class="inline_code">DICE</span>. Levenshtein edit distance measures the similarity between two strings as the number of operations (insert, delete, replace) needed to transform one string into the other (e.g., <em>cat</em> → <em>hat</em> → <em>what</em>). Dice coefficient measures the similarity as the number of shared bigrams (e.g., <em>nap</em> and <em>trap</em> share one bigram <em>ap</em>).</p>
<pre class="brush:python; gutter:false; light:true;">similarity(string1, string2, metric=LEVENSHTEIN)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.metrics import similarity, levenshtein
&gt;&gt;&gt;
&gt;&gt;&gt; print similarity('cat', 'what')
&gt;&gt;&gt; print levenshtein('cat', 'what')
0.5
2</pre></div>
<h3><a name="readability"></a>Readability</h3>
<p>The <span class="inline_code">readability()</span> function can be used to test the readability of a text. It returns a value between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>, based on Flesch Reading Ease, which measures word count and word length (= number of syllables per word).</p>
<pre class="brush:python; gutter:false; light:true;">readability(string)</pre><table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Readability</span></td>
<td><span class="smallcaps">Description</span></td>
</tr>
<tr>
<td><span class="inline_code">0.9-1.0</span></td>
<td>easily understandable by 11-year olds</td>
</tr>
<tr>
<td><span class="inline_code">0.6-0.7</span></td>
<td>easily understandable by 13 to 15-year olds</td>
</tr>
<tr>
<td><span class="inline_code">0.3-0.5</span></td>
<td>best understood by university graduates</td>
</tr>
</tbody>
</table>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.metrics import readability
&gt;&gt;&gt;
&gt;&gt;&gt; dr_seuss = "\n".join((
&gt;&gt;&gt; "'I know some good games we could play,' said the cat.",
&gt;&gt;&gt; "'I know some new tricks,' said the cat in the hat.",
&gt;&gt;&gt; "'A lot of good tricks. I will show them to you.'",
&gt;&gt;&gt; "'Your mother will not mind at all if I do.'"
&gt;&gt;&gt; ))
&gt;&gt;&gt; print readability(dr_seuss)
0.908 </pre></div>
<h3><a name="ttr"></a>Type-token ratio</h3>
<p>The <span class="inline_code">ttr()</span> function can be used to test the lexical diversity of a text. It returns a value between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>, which is the average percentage of unique words (types) for each <span class="inline_code">n</span> successive words (tokens) in the text.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">ttr(string, n=100, punctuation='.,;:!?()[]{}`''\"@#$^&amp;*+-|=~_')</pre><table class="border">
<tbody>
<tr>
<td class="smallcaps">Author</td>
<td class="smallcaps">Text</td>
<td class="smallcaps" style="text-align: center;">Year</td>
<td class="smallcaps">TTR</td>
</tr>
<tr>
<td>Dr. Seuss</td>
<td>The Cat In The Hat</td>
<td style="text-align: center;">1957</td>
<td class="inline_code">0.588</td>
</tr>
<tr>
<td>Lewis Carroll</td>
<td>Alice In Wonderland</td>
<td style="text-align: center;">1865</td>
<td class="inline_code">0.728</td>
</tr>
<tr>
<td>George Washington</td>
<td>First Inaugural Address</td>
<td style="text-align: center;">1789</td>
<td class="inline_code">0.722</td>
</tr>
<tr>
<td>George W. Bush</td>
<td>First Inaugural Address</td>
<td style="text-align: center;">2001</td>
<td class="inline_code">0.704</td>
</tr>
<tr>
<td>Barack Obama</td>
<td>First Inaugural Address</td>
<td style="text-align: center;">2009</td>
<td class="inline_code">0.717</td>
</tr>
</tbody>
</table>
<h3><a name="intertextuality"></a>Intertextuality</h3>
<p>The <span class="inline_code">intertextuality()</span> function can be used to test the overlap between texts (e.g., plagiarism detection). It takes a list of strings and returns a <span class="inline_code">dict</span> with <span class="inline_code">(i,</span> <span class="inline_code">j)</span>-tuples as keys and <span class="inline_code">float</span> values between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>. For indices <span class="inline_code">i</span> and <span class="inline_code">j</span> in the given list, the corresponding <span class="inline_code">float</span> is the percentage of text <span class="inline_code">i</span> that is also in text <span class="inline_code">j</span>. Overlap is measured by <a class="link-maintenance" href="pattern-en.html#ngram"><em>n</em>-grams</a> (by default <span class="inline_code">n=5</span> or five successive words). An optional <span class="inline_code">weight</span> function can be used to supply a weight for each <em>n</em>-gram (e.g., <a class="link-maintenance" href="pattern-vector.html#tf-idf">tf-idf</a>).</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">intertextuality(texts=[], n=5, weight=lambda ngram: 1.0)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.metrics import intertextuality
&gt;&gt;&gt; from glob import glob
&gt;&gt;&gt;
&gt;&gt;&gt; index = {}
&gt;&gt;&gt; texts = []
&gt;&gt;&gt; for i, f in enumerate(glob('data/*.txt')):
&gt;&gt;&gt; index[i] = f
&gt;&gt;&gt; texts.append(open(f).read())
&gt;&gt;&gt;
&gt;&gt;&gt; for (i, j), weight in intertextuality(texts, n=3).items():
&gt;&gt;&gt; if weight &gt; 0.1:
&gt;&gt;&gt; print index[i]
&gt;&gt;&gt; print index[j]
&gt;&gt;&gt; print weight
&gt;&gt;&gt; print weight.assessments # Set of overlapping n-grams.
&gt;&gt;&gt; print </pre></div>
<h3><a name="cooccurrence"></a>Cooccurrence</h3>
<p>The <span class="inline_code">cooccurrence()</span> function can be used to test how often words occur alongside each other. It takes an iterable, string, file or list of files, and returns a <span class="inline_code">{word1:</span> <span class="inline_code">{word2:</span> <span class="inline_code">count,</span> <span class="inline_code">word3:</span> <span class="inline_code">count,</span> ...<span class="inline_code">}}</span> dictionary.</p>
<p>A well-known application is distributional semantics. For example, if <em><span style="text-decoration: underline;">cat</span> meows</em> and <em><span style="text-decoration: underline;">cat</span> purrs</em> occur often, <em>meow</em> and <em>purr</em> are probably related to <em>cat</em>, and to each other. This requires a large text corpus (e.g., 10+ million words). For performance, it should be given as an <span class="inline_code">open(path)</span> iterator instead of an <span class="inline_code">open(path).read()</span> string.</p>
<p>The <span class="inline_code">window</span> parameter defines the size of the cooccurrence window, e.g., <span class="inline_code">(-1,</span> <span class="inline_code">-1)</span> means the word to the left of the anchor. The <span class="inline_code">term1</span> function defines which words are anchors (e.g., <em>cat</em>). By default, all words are anchors but this may raise a <span class="inline_code">MemoryError</span>. The <span class="inline_code">term2</span> function defines which co-occurring words to count. The optional <span class="inline_code">normalize</span> function can be used to transform words (e.g., strip punctuation).</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">cooccurrence(iterable, window=(-1, -1),
term1 = lambda w: True,
term2 = lambda w: True,
normalize = lambda w: w)</pre><p>What adjectives occur frequently in front of which nouns?</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.metrics import cooccurrence
&gt;&gt;&gt;
&gt;&gt;&gt; f = open('pattern/test/corpora/tagged-en-oanc.txt')
&gt;&gt;&gt; m = cooccurrence(f,
&gt;&gt;&gt; window = (-2, -1),
&gt;&gt;&gt; term1 = lambda w: w[1] == 'NN',
&gt;&gt;&gt; term2 = lambda w: w[1] == 'JJ',
&gt;&gt;&gt; normalize = lambda w: tuple(w.split('/')) # cat/NN =&gt; ('cat', 'NN')
&gt;&gt;&gt; )
&gt;&gt;&gt; for noun in m:
&gt;&gt;&gt; for adjective, count in m[noun].items():
&gt;&gt;&gt; print adjective, noun, count
('last', 'JJ') ('year', 'NN') 31
('next', 'JJ') ('year', 'NN') 10
('past', 'JJ') ('year', 'NN') 7
... </pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="statistics"></a>Statistics</h2>
<h3><a name="mean"></a>Mean, median, variance, standard deviation</h3>
<p>An <strong>average</strong> is a measure of the "center" of a data set (= a list of values). It can be measured in different ways, for example by mean, median or mode. Usually, a data set is a smaller <em>sample</em> of a <em>population</em>. For example, <span class="inline_code">[1</span><span class="inline_code">,2</span><span class="inline_code">,4</span><span class="inline_code">]</span> is a sample of powers of two. The mean is the sum of values divided by the sample size: (<span class="inline_code">1</span> + <span class="inline_code">2</span> + <span class="inline_code">4</span>) / <span class="inline_code">3</span> = <span class="inline_code">2.33</span>. The median is the middle value in the sorted list of values: <span class="inline_code">2</span>.</p>
<p>Variance measures how a data set is spread out. The square root of variance is called the standard deviation. A low standard deviation indicates that the values are clustered closely around the mean. A high standard deviation indicates that the values are spread out over a large range. The standard deviation can be used to test the reliability of a data set.</p>
<p>For example, for two equally competent sports teams, in which each player has a score, the team with the lower standard deviation is more reliable, since all players perform equally well on average. The team with the higher standard deviation may have very good players and very bad players (e.g., strong offense, weak defense), making their games more unpredictable.</p>
<p>The <span class="inline_code">avg()</span> or <span class="inline_code">mean()</span> function returns the mean. The <span class="inline_code">stdev()</span> function returns the standard deviation:</p>
<pre class="brush:python; gutter:false; light:true;">mean(iterable) # [1, 2, 4] =&gt; 2.33</pre><pre class="brush:python; gutter:false; light:true;">median(iterable) # [1, 2, 4] =&gt; 2</pre><pre class="brush:python; gutter:false; light:true;">variance(iterable, sample=False) # [1, 2, 4] =&gt; 1.56</pre><pre class="brush:python; gutter:false; light:true;">stdev(iterable, sample=False) # [1, 2, 4] =&gt; 1.25</pre><table class="border">
<tbody>
<tr>
<td class="smallcaps">Metric</td>
<td class="smallcaps">Formula</td>
</tr>
<tr>
<td>Mean</td>
<td><span class="inline_code">sum(list)</span> <span class="inline_code">/</span> <span class="inline_code">len(list)</span></td>
</tr>
<tr>
<td>Variance</td>
<td><span class="inline_code">sum((v</span> <span class="inline_code">-</span> <span class="inline_code">mean(list))</span> <span class="inline_code">**</span> <span class="inline_code">2</span> <span class="inline_code">for</span> <span class="inline_code">v</span> <span class="inline_code">in</span> <span class="inline_code">list)</span> <span class="inline_code">/</span> <span class="inline_code">len(list)</span></td>
</tr>
<tr>
<td>Standard deviation</td>
<td class="inline_code">sqrt(variance(list))</td>
</tr>
</tbody>
</table>
<p>To compute the sample variance with <a href="http://en.wikipedia.org/wiki/Bessel%27s_correction" target="_blank">bias correction</a>, i.e., <span class="inline_code">len(list)</span> <span class="inline_code">-</span> <span class="inline_code">1</span>, use <span class="inline_code">sample=True</span>.</p>
<p>We can use the <span class="inline_code">mean()</span> function to implement a generator for the simple moving average (SMA):</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.metrics import mean
&gt;&gt;&gt;
&gt;&gt;&gt; def sma(iterable, k=10):
&gt;&gt;&gt; a = list(iterable)
&gt;&gt;&gt; for m in xrange(len(a)):
&gt;&gt;&gt; i = m - k
&gt;&gt;&gt; j = m + k + 1
&gt;&gt;&gt; yield mean(a[max(0,i):j])</pre></div>
<h3><a name="gauss"></a>Normal distribution</h3>
<p>The normal (or Gaussian) distribution is a very common distribution of values. When graphed, it produces a bell-shaped curve. An <em>even</em> or uniform distribution on the other hand produces a straight horizontal line. For example, human intelligence is normally distributed. If we performed an IQ test among 750 individuals, about 2/3 or 500 of the IQ scores would range between IQ 85–115, or within one standard deviation (15) of the mean IQ 100. This means that few individuals have an exceptionally low or high IQ.</p>
<table class="border">
<tbody>
<tr>
<td style="text-align: center;">
<p><img style="display: block; margin-left: auto; margin-right: auto;" src="../g/pattern-metrics-bell.jpg" alt="" width="398" height="180" /></p>
<p><span class="smallcaps">distribution of iq scores</span></p></td>
</tr>
</tbody>
</table>
<p>The <span class="inline_code">norm()</span> function returns a list of <span class="inline_code">n</span> random samples from the normal distribution.</p>
<p>The <span class="inline_code">pdf()</span> or probability density function returns the chance (<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>) that a given value occurs in a normal distribution with the given <span class="inline_code">mean</span> and <span class="inline_code">stdev</span>.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">norm(n, mean=0.0, stdev=1.0) </pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">pdf(x, mean=0.0, stdev=1.0)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.metrics import pdf
&gt;&gt;&gt; print sum(pdf(iq, mean=100, stdev=15) for iq in range(85, 115))
0.6825 </pre></div>
<h3><a name="histogram"></a>Histogram</h3>
<p>The <span class="inline_code">histogram()</span> function returns a dictionary <span class="inline_code">{(start,</span> <span class="inline_code">stop):</span> <span class="inline_code">[v1,</span> <span class="inline_code">v2,</span> ...<span class="inline_code">]}</span> with the values from the given list grouped into <em>k</em> equal intervals. It is an estimate of the distribution of the data set (e.g., which intervals have the most values).</p>
<pre class="brush:python; gutter:false; light:true;">histogram(iterable, k=10)</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.metrics import histogram
&gt;&gt;&gt;
&gt;&gt;&gt; s = [70, 85, 85, 100, 100, 100, 115, 115, 130]
&gt;&gt;&gt; for (i, j), values in sorted(histogram(s, k=5).items()):
&gt;&gt;&gt; m = i + (j - i) / 2 # midpoint
&gt;&gt;&gt; print i, j, m, values
70.0 82.0 76.0 [70]
82.0 94.0 88.0 [85, 85]
94.0 106.0 100.0 [100, 100, 100]
106.0 118.0 112.0 [115, 115]
118.0 130.0 124.0 [130] </pre></div>
<h3><a name="moment"></a>Moment</h3>
<p>The <span class="inline_code">moment()</span> function returns the <em>n</em>-th central moment about the mean, where <span class="inline_code">n=2</span> is variance, <span class="inline_code">n=3</span> skewness and <span class="inline_code">n=4</span> kurtosis. Variance measures how <em>wide</em> the data is spread out. Skewness measures how <em>evenly</em> the data is spread out: <span class="inline_code">&gt;</span> <span class="inline_code">0</span> indicates fewer high values, <span class="inline_code">&lt;</span> <span class="inline_code">0</span> fewer low values. Kurtosis measures how tight the data is near the mean: <span class="inline_code">&gt;</span> <span class="inline_code">0</span> indicates fewer values near the mean (= more extreme values), <span class="inline_code">&lt;</span> <span class="inline_code">0</span> more values near the mean.</p>
<pre class="brush:python; gutter:false; light:true;">moment(iterable, n=2) # n=2 variance | 3 skewness | 4 kurtosis</pre><pre class="brush:python; gutter:false; light:true;">skewness(iterable) # &gt; 0 =&gt; fewer values over mean</pre><pre class="brush:python; gutter:false; light:true;">kurtosis(iterable) # &gt; 0 =&gt; fewer values near mean</pre><p>Skewness and kurtosis are <span class="inline_code">0.0</span> for the normal distribution:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.metrics import skewness
&gt;&gt;&gt; from random import gauss
&gt;&gt;&gt;
&gt;&gt;&gt; print skewness([gauss(100, 15) for i in xrange(100000)])
0.001 </pre></div>
<h3><a name="quantile"></a>Quantile &amp; box plot</h3>
<p>The <span class="inline_code">quantile()</span> function returns the interpolated value at point p (<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>) in a sorted list of values. With <span class="inline_code">p=0.5</span> it returns the median. The parameters <span class="inline_code">a</span>, <span class="inline_code">b</span>, <span class="inline_code">c</span>, <span class="inline_code">d</span> refer to the algorithm by Hyndman and Fan <a href="http://stat.ethz.ch/R-manual/R-patched/library/stats/html/quantile.html" target="_blank">[1]</a>.</p>
<p>The <span class="inline_code">boxplot()</span> function returns a <span class="inline_code">(min,</span> <span class="inline_code">q1,</span> <span class="inline_code">q2,</span> <span class="inline_code">q3,</span> <span class="inline_code">max)</span>-tuple for a given list of values, where <span class="inline_code">q2</span> is the median, <span class="inline_code">q1</span>&nbsp;the quantile with <span class="inline_code">p=0.25</span> and <span class="inline_code">q3</span> the quantile with <span class="inline_code">p=0.75</span>, i.e., the 25-75% range around the median.&nbsp;This can be used to identify outliers. For example, if a sample of temperatures in your house comprises you (37°C), the cat (38°C), the refrigerator (5°C) and the oven (220°C), then the average temperature is 75°C. This of course is incorrect since the oven is an outlier. It lies well outside the 25-75% range.</p>
<pre class="brush:python; gutter:false; light:true;">quantile(iterable, p=0.5, sort=True, a=1, b=-1, c=0, d=1)</pre><pre class="brush:python; gutter:false; light:true;">boxplot(iterable)</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.metrics import boxplot
&gt;&gt;&gt; print boxplot([5, 37, 38, 220])
(5.0, 29.0, 37.5, 83.5, 220.0)</pre></div>
<table class="border">
<tbody>
<tr>
<td style="text-align: center;"><img style="display: block; margin-left: auto; margin-right: auto;" src="../g/pattern-metrics-boxplot.jpg" alt="" width="398" height="137" /><span class="smallcaps">you, the cat, the fridge and the oven</span></td>
</tr>
</tbody>
</table>
<p><span class="small"><span style="text-decoration: underline;">Reference</span>: Adorio E. (2008) http://adorio-research.org/wordpress/?p=125</span></p>
<p>&nbsp;</p>
<hr />
<h2>Statistical tests</h2>
<h3><a name="fisher"></a>Fisher's exact test</h3>
<p>The <span class="inline_code">fisher()</span> function or <a href="http://en.wikipedia.org/wiki/Fisher's_exact_test">Fisher's exact test</a>&nbsp;can be used to test the contingency of a 2 x 2 classification. It returns probability <span class="inline_code">p</span> between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>, where <span class="inline_code">p</span> <span class="inline_code">&lt;</span> <span class="inline_code">0.05</span> is significant and <span class="inline_code">p</span> <span class="inline_code">&lt;</span> <span class="inline_code">0.01</span> is very significant.</p>
<p>Say that 96 pet owners were asked about their pet, and 29/46 men reported owning a dog and 30/50 women reported owning a cat. We have a 2 x 2 classification (cat or dog&nbsp;↔ man or woman)&nbsp;that we assume to be evenly distributed, i.e., we assume that men and women are equally fond of cats and dogs. This is the <em>null hypothesis</em>. But Fisher's exact test yields <span class="inline_code">p</span> <span class="inline_code">0.027</span> <span class="inline_code">&lt;</span> <span class="inline_code">0.05</span>&nbsp;so we need to <em>reject</em> the null hypothesis. There is a significant correlation between gender and pet ownership (women are more fond of cats).</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">fisher(a, b, c, d)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.metrics import fisher
&gt;&gt;&gt; print fisher(a=17, b=30, c=29, d=20)
0.027</pre></div>
<table class="border">
<tbody>
<tr>
<td>&nbsp;</td>
<td class="smallcaps" style="text-align: center;">men</td>
<td class="smallcaps" style="text-align: center;">women</td>
</tr>
<tr>
<td class="smallcaps" style="text-align: right;">cat owner</td>
<td style="text-align: center;"><span class="inline_code">17</span><span class="small"> (a)</span></td>
<td style="text-align: center;"><span class="inline_code">30</span><span class="small"> (b)</span></td>
</tr>
<tr>
<td class="smallcaps" style="text-align: right;">dog owner</td>
<td style="text-align: center;"><span class="inline_code">29</span><span class="small"> (c)</span></td>
<td style="text-align: center;"><span class="inline_code">20</span><span class="small"> (d)</span></td>
</tr>
</tbody>
</table>
<p class="small"><span style="text-decoration: underline;">Reference</span>: Edelson, J. &amp; Lester D. (1983). Personality and pet ownership: a preliminary study. <em>Psychological Reports</em>.</p>
<h3><a name="chi2"></a>Chi-squared test</h3>
<p>The <span class="inline_code">chi2()</span> function or <a href="http://en.wikipedia.org/wiki/Pearson's_chi-squared_test">Pearson's chi-squared test</a> can be used to test the contingency of an n x m classification. It returns an <span class="inline_code">(x2,</span>&nbsp;<span class="inline_code">p)</span>-tuple, where probability&nbsp;<span class="inline_code">p</span> <span class="inline_code">&lt;</span> <span class="inline_code">0.05</span> is significant and <span class="inline_code">p</span> <span class="inline_code">&lt;</span> <span class="inline_code">0.01</span> is very significant.&nbsp;The <span class="inline_code">observed</span> matrix is a list of lists of <span class="inline_code">int</span> values (i.e., absolute frequencies).&nbsp;By default, the <span class="inline_code">expected</span> matrix is evenly distributed over all classes, and&nbsp;<span class="inline_code">df</span> is&nbsp;<span class="inline_code">(n-1)</span> <span class="inline_code">*</span> <span class="inline_code">(m-1)</span>&nbsp;degrees of freedom.&nbsp;</p>
<p>Say that 255 pet owners aged 30, 40, 50 or 55+ were asked whether they owned a cat or a dog. We have an n x m classification (cat or dog&nbsp;↔ 30, 40, 50, 55+) that we assume to be evenly distributed, i.e., we assume that pet preference&nbsp;is unrelated to age.&nbsp;This is the <em>null hypothesis</em>. The chi-squared test for the data below yields <span class="inline_code">p</span> <span class="inline_code">0.89</span>&nbsp;<span class="inline_code">&gt;</span> <span class="inline_code">0.05</span>, so we cannot reject the null hypothesis.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">chi2(observed=[], expected=None, df=None)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.metrics import chi2
&gt;&gt;&gt; print chi2(observed=[[15, 22, 27, 21], [37, 40, 52, 41]])
(0.63, 0.89)</pre></div>
<table class="border">
<tbody>
<tr>
<td>&nbsp;</td>
<td style="text-align: center;">25–34</td>
<td style="text-align: center;">35–44</td>
<td style="text-align: center;">45–54</td>
<td style="text-align: center;">55+</td>
</tr>
<tr>
<td class="smallcaps" style="text-align: right;">cat owner</td>
<td class="inline_code" style="text-align: center;">15</td>
<td class="inline_code" style="text-align: center;">22</td>
<td class="inline_code" style="text-align: center;">27</td>
<td class="inline_code" style="text-align: center;">21</td>
</tr>
<tr>
<td class="smallcaps" style="text-align: right;">dog owner</td>
<td class="inline_code" style="text-align: center;">37</td>
<td class="inline_code" style="text-align: center;">40</td>
<td class="inline_code" style="text-align: center;">52</td>
<td class="inline_code" style="text-align: center;">41</td>
</tr>
</tbody>
</table>
<h3><a name="ks2"></a>Kolmogorov-Smirnov test</h3>
<p>The <span class="inline_code">ks2()</span> function or <a href="http://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test">two-sample Kolmogorov-Smirnov test</a> can be used to test if two samples are drawn from the same distribution. It returns a <span class="inline_code">(d,</span> <span class="inline_code">p)</span>-tuple with maximum distance <span class="inline_code">d</span> and probability <span class="inline_code">p</span>&nbsp;(<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>). By default, the second sample <span class="inline_code">a2</span> is <span class="inline_code">NORMAL</span>, i.e., a list with <span class="inline_code">n</span> values from&nbsp;<span class="inline_code">gauss(mean(a1),</span> <span class="inline_code">stdev(a1))</span>.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">ks2(a1, a2=NORMAL, n=1000)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.metrics import ks2
&gt;&gt;&gt; ks2([70, 85, 85, 100, 100, 100, 115, 115, 130], n=10000)
(0.17, 0.94)&nbsp;</pre></div>
<p>The values in the given list appear to be normally distributed (bell-shape).</p>
<p>&nbsp;</p>
<hr />
<h2>See also</h2>
<ul>
<li><a href="http://www.scipy.org/">Scipy</a> (BSD): scientific computing for Python.</li>
</ul>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,105 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-nl</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-nl" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-nl</a></div>
<h1>pattern.nl</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1418" class="node node-type-page"><div class="node-inner">
<div class="content">
<p><span class="big">The pattern.nl module contains a fast part-of-speech tagger for Dutch (identifies nouns, adjectives, verbs, etc. in a sentence), sentiment analysis, and tools for Dutch verb conjugation and noun singularization &amp; pluralization.</span></p>
<p>It can be used by itself or with other&nbsp;<a href="pattern.html">pattern</a>&nbsp;modules:&nbsp;<a href="pattern-web.html">web</a>&nbsp;|&nbsp;<a href="pattern-db.html">db</a>&nbsp;| <a href="pattern-en.html">en</a>&nbsp;|&nbsp;<a href="pattern-search.html">search</a>&nbsp;|&nbsp;<a href="pattern-vector.html">vector</a>&nbsp;|&nbsp;<a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema_nl.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<p>The functions in this module take the same parameters and return the same values as their counterparts in <a href="pattern-en.html">pattern.en</a>. Refer to the documentation there for more details.&nbsp;&nbsp;</p>
<h3>Noun singularization &amp; pluralization</h3>
<p>For Dutch nouns there is <span class="inline_code">singularize()</span> and <span class="inline_code">pluralize()</span>.&nbsp;The implementation is slightly less robust than the English version&nbsp;(accuracy 91% for singularization and 80% for pluralization).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.nl import singularize, pluralize
&gt;&gt;&gt;
&gt;&gt;&gt; print singularize('katten')
&gt;&gt;&gt; print pluralize('kat')
kat
katten </pre></div>
<h3>Verb conjugation</h3>
<p>For Dutch verbs there is <span class="inline_code">conjugate()</span>, <span class="inline_code">lemma()</span>, <span class="inline_code">lexeme()</span> and <span class="inline_code">tenses()</span>.&nbsp;The lexicon for verb conjugation contains about 4,000 common Dutch verbs. For unknown verbs it will fall back to a rule-based approach with an accuracy of about 81%.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.nl import conjugate
&gt;&gt;&gt; from pattern.nl import INFINITIVE, PRESENT, SG
&gt;&gt;&gt;
&gt;&gt;&gt; print conjugate('ben', INFINITIVE)
&gt;&gt;&gt; print conjugate('ben', PRESENT, 2, SG)
zijn
bent </pre></div>
<h3>Attributive &amp; predicative adjectives&nbsp;</h3>
<p>Dutch adjectives followed by a noun&nbsp;inflect with an&nbsp;<span class="inline_code">-e</span>&nbsp;suffix (e.g., <em>braaf</em>&nbsp;→ <em>brave kat</em>). You can get the base form with the <span class="inline_code">predicative()</span> function, or vice versa with&nbsp;<span class="inline_code">attributive()</span>. Accuracy is 99%.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.nl import attributive, predicative
&gt;&gt;&gt;
&gt;&gt;&gt; print predicative('brave')
&gt;&gt;&gt; print attributive('braaf')
braaf
brave </pre></div>
<h3 class="example">Sentiment analysis</h3>
<p class="example">For opinion mining there is <span class="inline_code">sentiment()</span>, which returns a (<span class="inline_code">polarity</span>, <span class="inline_code">subjectivity</span>)-tuple, based on a lexicon of adjectives.&nbsp;Polarity is a value between <span class="inline_code">-1.0</span> and <span class="inline_code">+1.0</span>, subjectivity between <span class="inline_code">0.0</span> and <span class="inline_code">1.0</span>. The accuracy is around 82% (P 0.79, R 0.86) for book reviews:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.nl import sentiment
&gt;&gt;&gt; print sentiment('Een onwijs spannend goed boek!')
(0.69, 0.90) </pre></div>
<h3>Parser</h3>
<p>For parsing there is <span class="inline_code">parse()</span>, <span class="inline_code">parsetree()</span> and <span class="inline_code">split()</span>. The <span class="inline_code">parse()</span> function annotates words in the given string with their part-of-speech <a class="link-maintenance" href="mbsp-tags.html">tags</a> (e.g., <span class="postag">NN</span> for nouns and <span class="postag">VB</span> for verbs). The <span class="inline_code">parsetree()</span> function takes a string and returns a tree of nested objects (<span class="inline_code">Text</span> → <span class="inline_code">Sentence</span> → <span class="inline_code">Chunk</span> → <span class="inline_code">Word</span>). The <span class="inline_code">split()</span> function takes the output of <span class="inline_code">parse()</span> and returns a <span class="inline_code">Text</span>. See the pattern.en documentation (<a class="link-maintenance" href="pattern-en.html#tree">here</a>) for how to manipulate <span class="inline_code">Text</span> objects.&nbsp;</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.nl import parse, split
&gt;&gt;&gt;
&gt;&gt;&gt; s = parse('De kat zit op de mat.')
&gt;&gt;&gt; for sentence in split(s):
&gt;&gt;&gt;     print sentence
Sentence('De/DT/B-NP/O kat/NN/I-NP/O zit/VBZ/B-VP/O op/IN/B-PP/B-PNP'
'de/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O')</pre></div>
<p>The parser is built on Jeroen Geertzen's <a href="http://cosmion.net/jeroen/software/brill_pos/" target="_blank">Dutch language model</a>.&nbsp;The accuracy is around 91%. The original&nbsp;<a href="http://lands.let.ru.nl/literature/hvh.1999.2.ps" target="_blank">WOTAN</a> tagset&nbsp;is mapped to <a href="mbsp-tags.html">Penn Treebank</a>. If you need to work with the original tags you can also use&nbsp;<span class="inline_code">parse()</span> with an optional parameter <span class="inline_code">tagset="WOTAN"</span>.</p>
<p class="small"><span style="text-decoration: underline;">Reference</span>: Geertzen, J. (2010).&nbsp;<em>Brill-NL. </em>Retrieved from: <a class="noexternal" style="color: inherit;" href="http://cosmion.net/jeroen/software/brill_pos/" target="_blank">http://cosmion.net/jeroen/software/brill_pos/</a>.</p>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,424 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-search</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-search" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-search</a></div>
<h1>pattern.search</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1357" class="node node-type-page"><div class="node-inner">
<div class="content">
<p class="big">The pattern.search module has a pattern matching system similar to regular expressions, that can be used to search a string by syntax (word function) or by semantics (word meaning).<span class="blue"> </span></p>
<p>It can be used by itself or with other <a href="pattern.html">pattern</a> modules: <a href="pattern-web.html">web</a> | <a href="pattern-db.html">db</a> | <a href="pattern-en.html">en</a> | search <span class="blue"> </span>| <a href="pattern-vector.html">vector</a> | <a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<ul>
<li><a href="#introduction">Searching + matching in a nutshell</a></li>
<li><a href="#pattern">Pattern</a></li>
<li><a href="#constraint">Constraint</a></li>
<li><a href="#match">Match</a></li>
<li><a href="#taxonomy">Taxonomy</a></li>
<li><a href="#utility">Utility functions</a></li>
</ul>
<p>&nbsp;</p>
<hr />
<h2><a name="introduction"></a>Searching + matching in a nutshell</h2>
<p>The <span class="inline_code">search()</span> function takes a string (e.g., a word or a sequence of words) and returns a list of non-overlapping matches in the given sentence. The <span class="inline_code">match()</span> function returns the first match, or <span class="inline_code">None</span>. Both functions call <span class="inline_code">compile()</span>, which takes a string and returns a <span class="inline_code">Pattern</span> object.</p>
<pre class="brush:python; gutter:false; light:true;">search(pattern, sentence)</pre><pre class="brush:python; gutter:false; light:true;">match(pattern, sentence)</pre><pre class="brush:python; gutter:false; light:true;">compile(pattern)</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.search import search
&gt;&gt;&gt; print search('rabbit', 'big white rabbit')
[Match(words=[Word('rabbit')])]</pre></div>
<p>Search strings can contain a wildcard character at the <span class="inline_code">*start</span>, at the <span class="inline_code">end*</span>, at <span class="inline_code">*both*</span> ends or <span class="inline_code">in*between</span>:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; print search('rabbit*', 'big white rabbit')
&gt;&gt;&gt; print search('rabbit*', 'big white rabbits')
[Match(words=[Word('rabbit')])]
[Match(words=[Word('rabbits')])]
</pre></div>
<p>Search strings can contain multiple options separated by a vertical bar:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; print search('rabbit|cony|bunny', 'big black bunny')
[Match(words=[Word('bunny')])]</pre></div>
<h3>Syntactical pattern matching</h3>
<p>The examples above can also be resolved with (faster) regular expressions. The pattern.search module is more useful with <em>parsed</em> sentences. The pattern.en module has a <a class="link-maintenance" href="pattern-en.html#parser">parser</a> that takes a string and assigns a part-of-speech tag to each word (e.g., <span class="postag">NN</span> = noun, <span class="postag">VB</span> = verb, <span class="postag">JJ</span> = adjective). The parser also groups words into chunks (e.g., <span class="postag">JJ</span> + <span class="postag">NN</span> = <span class="postag">NP</span> = noun phrase) and finds word lemmata (was → be).</p>
<p>A parsed <span class="inline_code">Sentence</span> or <span class="inline_code">Text</span> can be searched by part-of-speech tags:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.search import search
&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; t = parsetree('big white rabbit')
&gt;&gt;&gt; print t
&gt;&gt;&gt; print
&gt;&gt;&gt; print search('JJ', t) # all adjectives
&gt;&gt;&gt; print search('NN', t) # all nouns
&gt;&gt;&gt; print search('NP', t) # all noun phrases
[Sentence('big/JJ/B-NP/O white/JJ/I-NP/O rabbit/NN/I-NP/O')]
[Match(words=[Word(u'big/JJ')]), Match(words=[Word(u'white/JJ')])]
[Match(words=[Word(u'rabbit/NN')])]
[Match(words=[Word(u'big/JJ'), Word(u'white/JJ'), Word(u'rabbit/NN')])]</pre></div>
<h3>Semantical pattern matching</h3>
<p>A <span class="inline_code">Taxonomy</span> can be used to define semantical categories of words. Say we want to extract flower names from a text. The search pattern is rather clumsy: <span class="inline_code">"rose|lily|daisy|daffodil|begonia"</span>. A more robust approach is to work with a taxonomy:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.search import search, taxonomy
&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; for f in ('rose', 'lily', 'daisy', 'daffodil', 'begonia'):
&gt;&gt;&gt;     taxonomy.append(f, type='flower')
&gt;&gt;&gt;
&gt;&gt;&gt; t = parsetree('A field of white daffodils.', lemmata=True)
&gt;&gt;&gt; print t
&gt;&gt;&gt; print
&gt;&gt;&gt; print search('FLOWER', t)
[Sentence('A/DT/B-NP/O/a field/NN/I-NP/O/field of/IN/B-PP/B-PNP/of'
'white/JJ/B-NP/I-PNP/white daffodils/NNS/I-NP/I-PNP/daffodil ././O/O/.')]
[Match(words=[Word(u'white/JJ'), Word(u'daffodils/NNS')])]
</pre></div>
<p>Note how the search pattern has <span class="inline_code">"FLOWER"</span> in uppercase. Since <span class="inline_code">search()</span> is case-insensitive, uppercase words are recognized as taxonomy terms (i.e., <span class="postag">FLOWER</span> = rose + lily + daisy + daffodil + begonia). Furthermore, since lemmata were parsed, <em>daffodils</em> is recognized as the plural form of <em>daffodil</em> (the lemma), and as such also part of <span class="postag">FLOWER</span>.</p>
<p>Note that the returned match is <em>white daffodils</em>. Since <span class="inline_code">search()</span> is (by default) greedy, the whole <span class="postag">NP</span> chunk is matched. In other words, <em>white daffodils</em> is regarded as a more specific instance of <em>daffodil</em>.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="pattern"></a>Pattern</h2>
<p>A <span class="inline_code">Pattern</span> is a sequence of constraints that matches certain phrases in a (parsed) sentence. Each constraint can match a word in the sentence. If a number of successive words corresponds to the entire sequence of constraints, the phrase is a match. The search is case-insensitive.</p>
<p>Constraints can be constructed for syntax (e.g., find all adjectives) and semantics (e.g., find all product names). For example, <span class="inline_code">Pattern.fromstring("NP</span> <span class="inline_code">be</span> <span class="inline_code">*</span> <span class="inline_code">than</span> <span class="inline_code">NP")</span> matches phrases such as <em><span style="text-decoration: underline;">the cat</span> was faster than <span style="text-decoration: underline;">the mouse</span></em>, and <em><span style="text-decoration: underline;">Chuck Norris</span> is cooler than <span style="text-decoration: underline;">Dolph Lundgren</span></em>, since <span class="postag">NP</span> matches any noun phrase.<em> </em>With <span class="inline_code">TAXONOMY</span>, the global <span class="inline_code">taxonomy</span> is used to categorize words.</p>
<pre class="brush:python; gutter:false; light:true;">pattern = Pattern(sequence=[])</pre><pre class="brush:python; gutter:false; light:true;">pattern = Pattern.fromstring(string, taxonomy=TAXONOMY)</pre><pre class="brush:python; gutter:false; light:true;">pattern.sequence # List of Constraint objects.
pattern.groups # List of groups, each a list of Constraint objects.
pattern.strict # Disable greedy matching?
</pre><pre class="brush:python; gutter:false; light:true;">pattern.scan(string)
pattern.search(sentence)
pattern.match(sentence, start=0)</pre><ul>
<li><span class="inline_code">Pattern.scan()</span>&nbsp;returns <span class="inline_code">True</span> if <span class="inline_code">Sentence(string)</span> <em>may</em>&nbsp;yield matches.<br />It can be faster to scan a tagged string, before casting it to a <span class="inline_code">Sentence</span>&nbsp;or <span class="inline_code">Text</span> and searching it.&nbsp;</li>
<li><span class="inline_code">Pattern.search()</span> returns a list of <span class="inline_code">Match</span> objects from the given sentence.</li>
<li><span class="inline_code">Pattern.match()</span> returns the first <span class="inline_code">Match</span> found in the given sentence, or <span class="inline_code">None</span>.</li>
</ul>
<div>For example:</div>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.search import Pattern
&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; t = parsetree('Chuck Norris is cooler than Dolph Lundgren.', lemmata=True)
&gt;&gt;&gt; p = Pattern.fromstring('{NP} be * than {NP}')
&gt;&gt;&gt; m = p.match(t)
&gt;&gt;&gt; print m.group(1)
&gt;&gt;&gt; print m.group(2)
[Word(u'Chuck/NNP'), Word(u'Norris/NNP')]
[Word(u'Dolph/NNP'), Word(u'Lundgren/NNP')]</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="constraint"></a>Constraint</h2>
<p>A <span class="inline_code">Constraint</span> matches a set of (tagged) words and taxonomy terms. For example:</p>
<ul>
<li><span class="inline_code">Constraint.fromstring('with|of')</span> matches either <em>with</em> or <em>of</em>.</li>
<li><span class="inline_code">Constraint.fromstring('JJ?')</span> matches any adjective tagged <span class="postag">JJ</span>, but it is optional.</li>
<li><span class="inline_code">Constraint.fromstring('NP|SBJ')</span> matches subject noun phrases.</li>
<li><span class="inline_code">Constraint.fromstring('QUANTITY')</span> matches siblings of <span class="postag">QUANTITY</span> in the taxonomy.</li>
</ul>
<pre class="brush:python; gutter:false; light:true;">constraint = Constraint(
words = [],
tags = [],
chunks = [],
roles = [],
taxa = [],
optional = False,
multiple = False,
first = False,
taxonomy = TAXONOMY,
exclude = None,
custom = None )</pre><pre class="brush:python; gutter:false; light:true;">constraint = Constraint.fromstring(string, **kwargs)</pre><pre class="brush:python; gutter:false; light:true;">constraint.index
constraint.string
constraint.words # List of allowed words/lemmata (of, with, ...)
constraint.tags # List of allowed parts-of-speech (NN, JJ, ...)
constraint.chunks # List of allowed chunk types (NP, VP, ...)
constraint.roles # List of allowed chunk roles (SBJ, OBJ, ...)
constraint.taxa # List of allowed taxonomy terms.
constraint.taxonomy # Taxonomy used for lookup.
constraint.optional # True =&gt; matches zero or one word.
constraint.multiple # True =&gt; matches one or more words.
constraint.first # True =&gt; can only match first word.
constraint.exclude # None, or Constraint of disallowed options.
constraint.custom # function(word) returns True if match. </pre><pre class="brush:python; gutter:false; light:true;">constraint.match(word)</pre><h3>Constraint string syntax</h3>
<p><span class="inline_code">Constraint.fromstring()</span> returns a new <span class="inline_code">Constraint</span> from the given string. It takes the same optional parameters as the constructor. Uppercase words in the given string indicate a <a class="link-maintenance" href="mbsp-tags.html">part-of-speech tag</a> (e.g., <span class="postag">NN</span>, <span class="postag">JJ</span>, <span class="postag">VP</span>) or a taxonomy term (e.g. <span class="postag">PRODUCT</span>, <span class="postag">PERSON</span>).</p>
<p>Some characters like <span class="inline_code">|</span> or <span class="inline_code">?</span> are special. They affect how the constraint is interpreted:</p>
<table class="border">
<tbody>
<tr>
<td style="text-align: center;"><span class="smallcaps">Character</span></td>
<td><span class="smallcaps">Example</span></td>
<td><span class="smallcaps">Description</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">(</span></td>
<td><span class="inline_code">(JJ)</span></td>
<td>Wrapper for an optional constraint (deprecated).</td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">[</span></td>
<td><span class="inline_code">[Mac OS X | Windows Vista]</span></td>
<td>Wrapper for a constraint that has spaces.<span class="inline_code"> </span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">{</span></td>
<td><span class="inline_code">DT {JJ?} NN</span></td>
<td>Wrapper for match groups, e.g., <span class="inline_code">Match.group(1)</span>.</td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">_</span></td>
<td><span class="inline_code">Windows_Vista</span></td>
<td>Converted to a space.<span class="inline_code"> </span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">|</span></td>
<td><span class="inline_code">ADJP|ADVP</span></td>
<td>Separator for different options.</td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">*</span></td>
<td><span class="inline_code">JJ*</span></td>
<td>Used as a wildcard character. <span class="inline_code"> </span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">!</span></td>
<td><span class="inline_code">!be|VB*</span></td>
<td>Used in front of words/tags that are <span style="text-decoration: underline;">not</span> allowed.</td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">?</span></td>
<td><span class="inline_code">JJ?</span></td>
<td>Used as a suffix, constraint is optional.</td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">+</span></td>
<td><span class="inline_code">RB|JJ+</span> or <span class="inline_code">JJ?+</span> or <span class="inline_code">*+</span></td>
<td>Used as a suffix, constraint can span multiple words.<span class="inline_code"> </span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">^</span></td>
<td><span class="inline_code">^hello</span></td>
<td>Used as a prefix, constraint can only match first word.</td>
</tr>
</tbody>
</table>
<p>The characters listed in the table must be escaped if used as content (e.g., <span class="inline_code">"\?"</span>). You can use the module's <span class="inline_code">escape()</span> function. For example, <span class="inline_code">escape("hello?")</span> returns <span class="inline_code">"hello\?"</span>.</p>
<h3>Constraint matching</h3>
<p><span class="inline_code">Constraint.match()</span> returns <span class="inline_code">True</span> if the given string or <span class="inline_code">Word</span> is part of the constraint:</p>
<ul>
<li>the word (or its lemma) occurs in <span class="inline_code">Constraint.words</span>, OR,</li>
<li>the word (or its lemma) occurs in <span class="inline_code">Constraint.taxa</span> taxonomy tree, AND</li>
<li>the word tags and/or chunk tags match those defined in the constraint.</li>
</ul>
<p>It is case-insensitive. Individual terms in <span class="inline_code">Constraint.words</span> can contain wildcards (<span class="inline_code">*</span>). Some part-of-speech-tags can also contain wildcards: <span class="postag">NN*</span>, <span class="postag">VB*</span>, <span class="postag">JJ*</span>, <span class="postag">RB*</span>, <span class="postag">PR*</span>, <span class="postag">WP*</span>.</p>
<p>The following example demonstrates the use of optional and multiple constraints:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.search import search
&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; t = parsetree('tasty cat food')
&gt;&gt;&gt; print t
&gt;&gt;&gt; print
&gt;&gt;&gt; print search('DT? RB? JJ? NN+', t)
[Sentence('tasty/JJ/B-NP/O cat/NN/I-NP/O food/NN/I-NP/O')]
[Match(words=[Word(u'tasty/JJ'), Word(u'cat/NN'), Word(u'food/NN')])]</pre></div>
<p>The pattern matches successive nouns (<span class="postag">NN</span>), optionally preceded by a determiner (<span class="postag">DT</span>), adverb (<span class="postag">RB</span>) and/or adjective (<span class="postag">JJ</span>). It matches anything from <em>food</em> to <em>cat food</em>, <em>tasty cat food</em>, <em>the tasty cat food</em>, etc.</p>
<h3>Constraint = greedy</h3>
<p>The pattern.en parser groups words that belong together into chunks. For example, <em>the black cat</em> is one chunk, tagged <span class="postag">NP</span> (i.e., a noun phrase). The head of the chunk is <em>cat</em>. By default, when a constraint matches the chunk head, it will greedily match the entire chunk. This means that if we search for <em>cat</em> and the sentence has <em>a big black cat</em>, the entire chunk will be returned.</p>
<p>This behavior can be disabled by passing a <span class="inline_code">STRICT</span> flag to <span class="inline_code">Pattern</span>, <span class="inline_code">compile()</span>, <span class="inline_code">search()</span> or <span class="inline_code">match()</span>:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.search import search, STRICT
&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; t = parsetree('The black cat is lurking in the tree.')
&gt;&gt;&gt; print search('cat', t)
[Match(words=[Word(u'The/DT'), Word(u'black/JJ'), Word(u'cat/NN')])]
</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; print search('cat', t, STRICT)
[Match(words=[Word(u'cat/NN')])]
</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="match"></a>Match</h2>
<p><span class="inline_code">Pattern.search()</span> returns a list of <span class="inline_code">Match</span> objects, where each match is a list of successive <span class="inline_code">Word</span> objects.</p>
<pre class="brush:python; gutter:false; light:true;">match = Match(pattern, words=[])</pre><pre class="brush:python; gutter:false; light:true;">match.pattern # Pattern source.
match.words # List of Word objects.
match.string # String of words separated with a space.
match.start # Index of first word in sentence.
match.stop # Index of last word in sentence + 1.</pre><pre class="brush:python; gutter:false; light:true;">match.group(index, chunked=False)
match.constraint(word)
match.constraints(chunk)
match.constituents(constraint=None)</pre><ul>
<li><span class="inline_code">Match.group()</span> returns a list of <span class="inline_code">Word</span> objects matching the constraints in a <span class="inline_code">{</span> <span class="inline_code">}</span> group.</li>
<li><span class="inline_code">Match.constraint()</span> returns the <span class="inline_code">Constraint</span> that matched the given <span class="inline_code">Word</span>, or <span class="inline_code">None</span>.</li>
<li><span class="inline_code">Match.constraints()</span> returns the list of constraints that matched the given <span class="inline_code">Chunk</span>.</li>
<li><span class="inline_code">Match.constituents()</span> returns a list of <span class="inline_code">Word</span> and <span class="inline_code">Chunk</span> objects, with successive words grouped into chunks whenever possible. Optionally, returns only chunks/words that matched the given <span class="inline_code">Constraint</span> (or list of constraints). Chunks are only available if a <span class="inline_code">Sentence</span> or <span class="inline_code">Text</span> was given (i.e., not for plain strings).</li>
</ul>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.search import match
&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; t = parsetree('The turtle was faster than the hare.', lemmata=True)
&gt;&gt;&gt; m = match('NP be ADJP|ADVP than NP', t)
&gt;&gt;&gt;
&gt;&gt;&gt; for w in m.words:
&gt;&gt;&gt;     print w, '\t =&gt;', m.constraint(w)
Word(u'The/DT') =&gt; Constraint(chunks=['NP'])
Word(u'turtle/NN') =&gt; Constraint(chunks=['NP'])
Word(u'was/VBD') =&gt; Constraint(words=['be'])
Word(u'faster/RBR') =&gt; Constraint(chunks=['ADJP', 'ADVP'])
Word(u'than/IN') =&gt; Constraint(words=['than'])
Word(u'the/DT') =&gt; Constraint(chunks=['NP'])
Word(u'hare/NN') =&gt; Constraint(chunks=['NP'])
</pre></div>
<h3>Match groups</h3>
<p>Match groups in the search pattern can be used to quickly retrieve what you need from a <span class="inline_code">Match</span>:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; t = parsetree('the big black dog')
&gt;&gt;&gt; m = match('DT {JJ?+ NN}', t)
&gt;&gt;&gt; print m.group(0) # full pattern
&gt;&gt;&gt; print m.group(1) # {JJ?+ NN}
&gt;&gt;&gt; print m.group(1).string
[Word(u'the/DT'), Word(u'big/JJ'), Word(u'black/JJ'), Word(u'dog/NN')]
[Word(u'big/JJ'), Word(u'black/JJ'), Word(u'dog/NN')]
'big black dog'</pre></div>
<h3>Match words</h3>
<p>Each <span class="inline_code">Word</span> in a <span class="inline_code">Match</span> or <span class="inline_code">Match.group()</span> has the following attributes:</p>
<pre class="brush:python; gutter:false; light:true;">word = Word(sentence, string, tag=None, index=0)</pre><pre class="brush:python; gutter:false; light:true;">word.string
word.tag # Part-of-speech tag (e.g. NN, JJ).
word.sentence # Sentence (a list of successive Words).
word.index # Sentence index.
</pre><p>When <span class="inline_code">search()</span> or <span class="inline_code">match()</span> is given a string, <span class="inline_code">Word</span> objects are created when the <span class="inline_code">Match</span> is returned. When given a parsed <span class="inline_code">Sentence</span>, <span class="inline_code">Word</span> objects are linked from the sentence. These have extra attributes. For an overview of <span class="inline_code">Sentence</span>, <span class="inline_code">Chunk</span> and <span class="inline_code">Word</span>, see the <a class="link-maintenance" href="pattern-en.html#tree">parse tree</a> documentation.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="taxonomy"></a>Taxonomy</h2>
<p>A taxonomy is a hierarchical tree of words classified by semantic type. For example: a <em>begonia</em> is a <em>flower</em>, and a <em>flower</em> is a <em>plant</em>. Taxonomy terms can be used as constraints. For example, <span class="inline_code">"FLOWER"</span> will match <em>flower</em> as well as <em>begonia</em>, or any other flower that has been defined in the taxonomy. By default, constraints will retrieve terms from the global <span class="inline_code">taxonomy</span>.</p>
<pre class="brush:python; gutter:false; light:true;">taxonomy = Taxonomy()</pre><pre class="brush:python; gutter:false; light:true;">taxonomy.case_sensitive # False by default.
taxonomy.classifiers # List of Classifier objects.</pre><pre class="brush:python; gutter:false; light:true;">taxonomy.append(term, type=None)
taxonomy.remove(term)</pre><pre class="brush:python; gutter:false; light:true;">taxonomy.classify(term)
taxonomy.parents(term, recursive=False)
taxonomy.children(term, recursive=False)
</pre><ul>
<li><span class="inline_code">Taxonomy.classify()</span> returns the (most recent) semantic type for a given term.<br />If the term is not in the taxonomy, it will try <span class="inline_code">Taxonomy.classifiers</span> (see further).</li>
<li><span class="inline_code">Taxonomy.parents()</span> returns a list of all semantic types for the given term.</li>
<li><span class="inline_code">Taxonomy.children()</span> returns a list of all terms for the given semantic type.<br />With <span class="inline_code">recursive=True</span>, traverses the entire branch.</li>
</ul>
<p>For example:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.search import taxonomy, search
&gt;&gt;&gt;
&gt;&gt;&gt; taxonomy.append('chicken', type='food')
&gt;&gt;&gt; taxonomy.append('chicken', type='bird')
&gt;&gt;&gt; taxonomy.append('penguin', type='bird')
&gt;&gt;&gt; taxonomy.append('bird', type='animal')
&gt;&gt;&gt;
&gt;&gt;&gt; print taxonomy.parents('chicken')
&gt;&gt;&gt; print taxonomy.children('animal', recursive=True)
&gt;&gt;&gt; print
&gt;&gt;&gt; print search('FOOD', "I'm eating chicken.")
['bird', 'food']
['bird', 'penguin', 'chicken']
[Match(words=[Word('chicken')])]</pre></div>
<h3>Taxonomy classifier</h3>
<p>A <span class="inline_code">Classifier</span> offers a rule-based approach to enrich the taxonomy. If a term is not in the taxonomy, it will iterate its list of classifiers. Each classifier is a set of functions that can be customized to yield the semantic type of a term.</p>
<pre class="brush:python; gutter:false; light:true;">classifier = Classifier(
parents = lambda term: [],
children = lambda term: [])</pre><pre class="brush:python; gutter:false; light:true;">classifier.parents(term) # Returns a list of parents for a term.
classifier.children(term) # Returns a list of children for a term.
</pre><p>This is useful because taxonomy terms can't include wildcards (i.e., the <span class="inline_code">*</span> character is taken literally).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.search import taxonomy, search
&gt;&gt;&gt; from pattern.search import Classifier
&gt;&gt;&gt;
&gt;&gt;&gt; def parents(term):
&gt;&gt;&gt;     return ['quality'] if term.endswith('ness') else []
&gt;&gt;&gt;
&gt;&gt;&gt; taxonomy.classifiers.append(Classifier(parents))
&gt;&gt;&gt; taxonomy.append('cat', type='animal')
&gt;&gt;&gt;
&gt;&gt;&gt; print search('QUALITY of a|an|the ANIMAL', 'the litheness of a cat')
[Match(words=[Word('litheness'), Word('of'), Word('a'), Word('cat')])]</pre></div>
<p>This example creates a classifier that tags words ending in <em>-ness</em> as <span class="postag">quality</span> (e.g., sharpness, greediness). This is more concise than manually adding all words ending in <em>-ness</em> to the taxonomy. The <span class="postag">quality</span> term is then used as a constraint.&nbsp;Remember to always define <span class="inline_code">Classifier.parents()</span>. For performance, <span class="inline_code">Classifier.children()</span> is not called in <span class="inline_code">Constraint.match()</span>.</p>
<h3 class="example">Taxonomy classifier from WordNet</h3>
<p class="example">The following example creates a rule-based taxonomy from the <span class="inline_code">pattern.en.wordnet</span> module:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.search import taxonomy, WordNetClassifier
&gt;&gt;&gt;
&gt;&gt;&gt; taxonomy.classifiers.append(WordNetClassifier())
&gt;&gt;&gt;
&gt;&gt;&gt; print taxonomy.parents('cat', pos='NN')
&gt;&gt;&gt; print taxonomy.parents('cat', pos='VB')
['feline']
['flog']</pre></div>
<table class="border">
<tbody>
<tr>
<td style="text-align: center;">
<p><br /><img src="../g/pattern-search-taxonomy.jpg" alt="" width="300" height="163" /></p>
<p><span style="display: inline !important;"><br /><span class="smallcaps">wordnet taxonomy example</span></span></p>
</td>
</tr>
</tbody>
</table>
<p>&nbsp;</p>
<hr />
<h2><a name="utility"></a>Utility functions</h2>
<p>The pattern.search module has a number of useful list functions:</p>
<pre class="brush:python; gutter:false; light:true;">unique(iterable) # Returns a new list with unique items.</pre><pre class="brush:python; gutter:false; light:true;">find(function, iterable) # Returns first item for which function(item) is True.</pre><pre class="brush:python; gutter:false; light:true;">product(iterable, repeat=1) # Returns a generator of all combinations of length n.</pre><pre class="brush:python; gutter:false; light:true;">variations(iterable, optional=lambda item: False)</pre><pre class="brush:python; gutter:false; light:true;">odict(items=[])</pre><ul>
<li><span class="inline_code">product()</span> returns a generator of all permutations, with replacement. <br />For example: <span class="inline_code">product([1,2,3],</span> <span class="inline_code">repeat=2)</span>&nbsp;yields:<br /><span class="inline_code">[1,1],</span> <span class="inline_code">[1,2],</span> <span class="inline_code">[1,3],</span> <span class="inline_code">[2,1],</span> <span class="inline_code">[2,2],</span> <span class="inline_code">[2,3],</span> <span class="inline_code">[3,1],</span> <span class="inline_code">[3,2],</span> <span class="inline_code">[3,3]</span></li>
<li><span class="inline_code">variations()</span> returns all variations of a sequence with optional items (in-order).</li>
<li><span class="inline_code">odict()</span> is a dictionary with ordered keys (e.g., like a stack).<br />The most recent keys will be returned first when traversing the dictionary.<br /><span class="inline_code">odict.push()</span> takes a <span class="inline_code">(key,</span> <span class="inline_code">value)</span>-tuple and sets the given key to the given value. If the key exists, it pushes the updated item to the top of the stack.</li>
</ul>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,115 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-shell</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-shell" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-shell</a></div>
<h1>pattern.shell</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1400" class="node node-type-page"><div class="node-inner">
<div class="content">
<h2>pattern.en parser</h2>
<p>The English parser can be invoked from the command-line.&nbsp;The&nbsp;<a href="pattern.html">pattern</a> module should be installed (i.e., located in <span class="inline_code">/site-packages</span>, see installation instructions) or the current working directory should be the one that contains the <span class="inline_code">pattern</span> folder.</p>
<pre class="brush:python; gutter:false; light:true;">&gt; python -m pattern.en -f file.txt</pre><p><span>If no options are given a full parse is executed (i.e. tokenization, tagging, chunking, relations and lemmata). Otherwise, you need to explicitly list every required option:</span></p>
<table class="border">
<tbody>
<tr>
<td><span class="inline_code">-O</span></td>
<td><span class="inline_code">--tokenize</span></td>
<td>Tokenize the input.</td>
</tr>
<tr>
<td><span class="inline_code">-T&nbsp;</span></td>
<td><span class="inline_code">--tags&nbsp;</span></td>
<td>Parse part-of-speech tags.</td>
</tr>
<tr>
<td><span class="inline_code">-C</span>&nbsp;</td>
<td><span class="inline_code">--chunks&nbsp;</span></td>
<td>Parse chunks and <span class="postag">PNP</span> tags.&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">-R</span>&nbsp;</td>
<td><span class="inline_code">--relations</span>&nbsp;</td>
<td>Parse verb/predicate relations.&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">-L</span>&nbsp;</td>
<td><span class="inline_code">--lemmata&nbsp;</span></td>
<td>Parse lemmata (<em>was</em> &rarr; <em>be</em>).&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">-f&nbsp;</span></td>
<td><span class="inline_code">--file</span>&nbsp;</td>
<td>Input file path.&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">-s&nbsp;</span></td>
<td><span class="inline_code">--string&nbsp;</span></td>
<td>Input string.&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">-e</span>&nbsp;</td>
<td><span class="inline_code">--encoding</span>&nbsp;</td>
<td>Specify character encoding (utf-8 by default).&nbsp;</td>
</tr>
<tr>
<td><span class="inline_code">-v&nbsp;</span></td>
<td class="inline_code">--version</td>
<td>Print current version of Pattern.</td>
</tr>
</tbody>
</table>
<p>Short options can be concatenated. Also note the <span class="inline_code">xml</span> option which produces XML output:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt; python -m pattern.en xml -OT -s 'The black cat sat on the mat.'</pre></div>
<h3><span>pattern.es | de | fr | it | nl parsers</span></h3>
<p><span>The parsers for other languages work in the same way. Note the <span class="inline_code">xml</span> option (produces XML output).</span></p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt; python -m pattern.es -s 'El gato negro se sienta en la estera.'</pre></div>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt; python -m pattern.de -s 'Die schwarze Katze liegt auf der Matte.'</pre></div>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt; python -m pattern.fr -s "Le chat noir s'était assis sur le tapis."</pre></div>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt; python -m pattern.it -s 'Il gatto nero faceva le fusa.'</pre></div>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt; python -m pattern.nl -s 'De zwarte kat zat op de mat.'</pre></div>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,937 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-vector</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-vector" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-vector</a></div>
<h1>pattern.vector</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1377" class="node node-type-page"><div class="node-inner">
<div class="content">
<p><span class="big">The pattern.vector module contains easy-to-use machine learning tools, starting from word count functions, bag-of-word documents and a vector space model, to latent semantic analysis and algorithms for clustering and classification (Naive Bayes, <em>k</em>-NN, Perceptron, SVM).</span></p>
<p>It can be used by itself or with other <a href="pattern.html">pattern</a> modules: <a href="pattern-web.html">web</a> | <a href="pattern-db.html">db</a> | <a href="pattern-en.html">en</a> | <a href="pattern-search.html">search</a> <span class="blue"> </span>| vector | <a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<ul>
<li><a href="#wordcount">Word count</a></li>
<li><a href="#tf-idf">TF-IDF<span class="smallcaps"> </span></a></li>
<li><a href="#document">Document</a></li>
<li><a href="#model">Model</a></li>
<li><a href="#lsa">Latent Semantic Analysis</a></li>
<li><a href="#cluster">Clustering</a> <span class="smallcaps link-maintenance">(<a href="#kmeans">k-means</a>, <a href="#hierarchical">hierarchical</a>)</span></li>
<li><a href="#classification">Classification</a> <span class="smallcaps link-maintenance">(<a href="#nb">nb</a>, <a href="#knn">knn</a>, <a href="#SLP">slp</a>,&nbsp;<a href="#svm">svm</a>)</span></li>
<li><a href="#ga">Genetic algorithm</a></li>
</ul>
<p>&nbsp;</p>
<hr />
<h2><a name="wordcount"></a>Word count</h2>
<p>One way to measure which words in a text matter is to count the number of times each word appears in the text. Different texts can then be compared, based on the keywords they share. This is an important task in many <em>text mining</em> applications, e.g., search engines, social network monitoring, targeted ads, recommender systems ("you may also like"), and so on.</p>
<p>The <span class="inline_code">words()</span> and <span class="inline_code">count()</span> functions can be used to count words in a given string:</p>
<pre class="brush:python; gutter:false; light:true;">words(string,
filter = lambda w: w.strip("'").isalnum(),
punctuation = '.,;:!?()[]{}`''\"@#$^&amp;*+-|=~_')
</pre><pre class="brush:python; gutter:false; light:true;">count(
words = [],
top = None, # Filter words not in the top most frequent (int).
threshold = 0, # Filter words whose count &lt;= threshold.
stemmer = None, # PORTER | LEMMA | function | None
exclude = [], # Filter words in the exclude list.
stopwords = False, # Include stop words?
language = 'en') # en, es, de, fr, it, nl
</pre><ul>
<li><span class="inline_code">words()</span> returns a list of words by splitting the string on spaces.<br />Punctuation marks are stripped from words. If <span class="inline_code">filter(word)</span> is&nbsp;<span class="inline_code">False</span>, the word is excluded.</li>
<li><span class="inline_code">count()</span> takes a list of words and returns a dictionary of <span class="inline_code">(word,</span> <span class="inline_code">count)</span>-items.</li>
</ul>
<h3>Stop words &amp; stemming</h3>
<p><a href="https://github.com/clips/pattern/blob/master/pattern/vector/stopwords-en.txt">Stop words</a>&nbsp;are common words (e.g. I, the, very, about) that are ignored with <span class="inline_code">count(stopwords=False)</span>. There is no definite list of stop words, so you may need to tweak it.</p>
<p>With <span class="inline_code">count(stemmer=PORTER)</span>, the&nbsp;<span class="inline_code">stem()</span>&nbsp;function is used to normalize words. For example,&nbsp;<em>consisted</em> and <em>consistently</em> are stemmed to <em>consist</em>, and&nbsp;<em>spies</em> is stemmed to <em>spi</em>&nbsp;(<a href="http://tartarus.org/%7Emartin/PorterStemmer/">Porter2 stemming algorithm</a>).</p>
<p>With <span class="inline_code">count(stemmer=LEMMA)</span>, the&nbsp;<span class="inline_code">pattern.en.singularize()</span> and&nbsp;<span class="inline_code">conjugate()</span>&nbsp;functions are used to normalize words if a <a class="link-maintenance" href="pattern-en.html#parse">parsed</a> &nbsp;<span class="inline_code">Sentence</span> or <span class="inline_code">Text</span> is given. This is more robust, but also slower.</p>
<pre class="brush:python; gutter:false; light:true;">stem(word, stemmer=PORTER)</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.vector import stem, PORTER, LEMMA
&gt;&gt;&gt;
&gt;&gt;&gt; print stem('spies', stemmer=PORTER)
&gt;&gt;&gt; print stem('spies', stemmer=LEMMA)
spi
spy</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.vector import count, words, PORTER, LEMMA
&gt;&gt;&gt;
&gt;&gt;&gt; s = 'The black cat was spying on the white cat.'
&gt;&gt;&gt; print count(words(s), stemmer=PORTER)
&gt;&gt;&gt; print count(words(s), stemmer=LEMMA)
{u'spi': 1, u'white': 1, u'black': 1, u'cat': 2}</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.vector import count, LEMMA
&gt;&gt;&gt; from pattern.en import parse, Sentence
&gt;&gt;&gt;
&gt;&gt;&gt; s = 'The black cat was spying on the white cat.'
&gt;&gt;&gt; s = Sentence(parse(s))
&gt;&gt;&gt; print count(s, stemmer=LEMMA)
{u'spy': 1, u'white': 1, u'black': 1, u'cat': 2, u'.': 1}&nbsp;</pre></div>
<h3>Character <em>n</em>-grams</h3>
<p>Another counting technique is to split a text into sequences of <em>n</em> successive characters. Although these are more difficult to interpret, they can be quite effective for comparing texts.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">chngrams(string="", n=3, top=None, threshold=0, exclude=[])</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.vector import chngrams
&gt;&gt;&gt; print chngrams('The cat sat on the mat.', n=3)
{' ca': 1, 'at ': 2, 'he ': 2, 't o': 1,
' ma': 1, 'at.': 1, 'mat': 1, 't s': 1,
' on': 1, 'cat': 1, 'n t': 1, 'the': 2,
' sa': 1, 'e c': 1, 'on ': 1,
' th': 1, 'e m': 1, 'sat': 1
}</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="tf-idf"></a>Term frequency inverse document frequency</h2>
<p>Word count or <em>term frequency</em> (tf) is a measure of a word's relevance in a text. Similarly, <em>document frequency</em>&nbsp;(df) is a measure of a word's relevance across multiple texts. Dividing term frequency by document frequency yields tf-idf, a measure of how important or unique a word is in a text in relation to other texts. For example, even if the words "the" and "is" may occur frequently in one text, they are not that important in this text, since they occur frequently in many other texts. This can be used to build a search engine, for example. If a user queries for "cat", the search engine returns the pages that have a high tf-idf for "cat".</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Metric</span></td>
<td><span class="smallcaps">Description</span></td>
</tr>
<tr>
<td><span class="inline_code">tf</span></td>
<td>number of occurrences of a word <span class="inline_code">/</span> number of words in document</td>
</tr>
<tr>
<td><span class="inline_code">df</span></td>
<td>number of documents containing a word <span class="inline_code">/</span> number of documents</td>
</tr>
<tr>
<td><span class="inline_code">idf</span></td>
<td><span class="inline_code">ln(1/df)</span></td>
</tr>
<tr>
<td><span class="inline_code">tf-idf</span></td>
<td><span class="inline_code">tf</span> <span class="inline_code">*</span> <span class="inline_code">idf</span></td>
</tr>
</tbody>
</table>
<p>&nbsp;</p>
<hr />
<h2><a name="cos"></a>Cosine similarity</h2>
<p>A <em>document vector</em> is a dictionary of distinct words in a document (i.e., text, paragraph, sentence) with their tf-idf. Higher tf-idf indicates words that are more important (i.e., keywords). A collection of document vectors is called a <em>vector space model</em>, a matrix of words x documents. By calculating the matrix dot product (angle) of two document vectors, we can measure how similar they are. This is called <em>cosine similarity</em>.</p>
<p>Let <span class="inline_code">v1</span>, <span class="inline_code">v2</span> be <span class="inline_code">Document.vector</span> objects:</p>
<p><span class="inline_code">cosθ</span> <span class="inline_code">=</span> <span class="inline_code">dot(v1,</span> <span class="inline_code">v2)</span> <span class="inline_code">/</span> <span class="inline_code">(v1.norm</span>&nbsp;<span class="inline_code">*</span> <span class="inline_code">v2.norm) </span></p>
<p>&nbsp;</p>
<hr />
<h2><a name="document"></a>Document</h2>
<p>A <span class="inline_code">Document</span> is an unordered <em>bag-of-words</em> representation of a given string, dictionary of <span class="inline_code">(word,</span> <span class="inline_code">count)</span>-items,&nbsp;<span class="inline_code">Sentence</span>&nbsp;or&nbsp;<span class="inline_code">Text</span>.&nbsp;Documents can be bundled in a <span class="inline_code">Model</span>. Bag-of-words means that the word order in the given text is discarded. Instead, words are counted using the <span class="inline_code">words()</span>, <span class="inline_code">count()</span> and&nbsp;<span class="inline_code">stem()</span> functions. This exposes keywords (= high word count) in the text, by which documents can be compared for similarity.</p>
<p>The <span class="inline_code">Document.words</span> dictionary maps words to word count. The generalized&nbsp;<span class="inline_code">Document.vector</span>&nbsp;dictionary maps&nbsp;<em>features</em> (e.g., words) to <em>feature weights</em> (e.g., relative word count). We call them features because they can be other things besides words in a text, for example id's or labels. For a document that is not part of a <span class="inline_code">Model</span>, the feature weights are <span class="inline_code">TF</span>, relative frequency between <span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>. This is useful when comparing long vs. short texts. Say we have a 10,000-word document that mentions "cat"&nbsp;5000x and a 10-word document that mentions "cat" 5x. They are quite similar since they both mention "cat" 50% (0.5) of the time. Documents that are part of a <span class="inline_code">Model</span> can use different weighting schemes such as <span class="inline_code">TF</span>,&nbsp;<span class="inline_code">TFIDF</span>, <span class="inline_code">IG</span> and <span class="inline_code">BINARY</span>.</p>
<pre class="brush:python; gutter:false; light:true;">document = Document(string,
filter = lambda w: w.lstrip("'").isalnum(),
punctuation = '.,;:!?()[]{}\'`"@#$*+-|=~_',
top = None, # Filter words not in the top most frequent.
threshold = 0, # Filter words whose count falls below threshold.
exclude = [], # Filter words in the exclude list.
stemmer = None, # PORTER | LEMMA | function | None.
stopwords = False, # Include stop words?
name = None,
type = None,
language = None,
description = None)</pre><pre class="brush:python; gutter:false; light:true;">document.id # Unique number (read-only).
document.name # Unique name, or None, used in Model.document().
document.type # Document type, used with classifiers.
document.language # Document language (e.g., 'en').
document.description # Document info.
document.model # The parent Model, or None.
document.features # List of words from Document.words.keys().
document.words # Dictionary of (word, count)-items (read-only).
document.wordcount # Total word count.
document.vector # Cached Vector (read-only dict).</pre><pre class="brush:python; gutter:false; light:true;">document.tf(word)
document.tfidf(word) # Note: simply yields tf if model is None.
document.keywords(top=10, normalized=True)
</pre><pre class="brush:python; gutter:false; light:true;">document.copy()
</pre><ul>
<li><span class="inline_code">Document.tf()</span> returns the frequency of a word as a number between <span class="inline_code">0.0-1.0</span>.</li>
<li><span class="inline_code">Document.tfidf()</span> returns the word's relevancy as tf-idf.<span class="inline_code"> </span></li>
<li><span class="inline_code">Document.keywords()</span> returns a sorted list of <span class="inline_code">(weight,</span> <span class="inline_code">word)</span>-tuples.<br />With <span class="inline_code">normalized=True</span>&nbsp;the weights will be between <span class="inline_code">0.0-1.0</span> (their sum is <span class="inline_code">1.0</span>).</li>
</ul>
<p>For example:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.vector import Document
&gt;&gt;&gt;
&gt;&gt;&gt; s = '''
&gt;&gt;&gt; The shuttle Discovery, already delayed three times by technical problems
&gt;&gt;&gt; and bad weather, was grounded again Friday, this time by a potentially
&gt;&gt;&gt; dangerous gaseous hydrogen leak in a vent line attached to the shipʼs
&gt;&gt;&gt; external tank. The Discovery was initially scheduled to make its 39th
&gt;&gt;&gt; and final flight last Monday, bearing fresh supplies and an intelligent
&gt;&gt;&gt; robot for the International Space Station. But complications delayed the
&gt;&gt;&gt; flight from Monday to Friday, when the hydrogen leak led NASA to conclude
&gt;&gt;&gt; that the shuttle would not be ready to launch before its flight window
&gt;&gt;&gt; closed this Monday.
&gt;&gt;&gt; '''
&gt;&gt;&gt; d = Document(s, threshold=1)
&gt;&gt;&gt; print d.keywords(top=6)
[(0.17, u'flight'),
(0.17, u'monday'),
(0.11, u'delayed'),
(0.11, u'discovery'),
(0.11, u'friday'),
(0.11, u'hydrogen')
]</pre></div>
<h3>Document vector</h3>
<p>A <span class="inline_code">Document.vector</span> is a read-only, sparse (non-zero values) <span class="inline_code">dict</span> of&nbsp;<span class="inline_code">(feature,</span> <span class="inline_code">weight)</span>-items, where weight is the relative frequency (<span class="inline_code">TF</span>) of a feature in the document. Documents can be bundled in a&nbsp;<span class="inline_code">Model</span> with other weighting schemes such as <span class="inline_code">TFIDF</span>, <span class="inline_code">IG</span> and <span class="inline_code">BINARY</span>.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">vector = Document.vector</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">vector = Vector(*args, **kwargs) # Same arguments as dict().</pre><p>The pattern.vector module has the following low-level functions for vectors (or&nbsp;<span class="inline_code">dicts</span>):</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">normalize(vector) # Adjusts weights so sum is 1.</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">tfidf(vectors=[], base=2.72) # Adjusts weights to tf * idf.</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">distance(v1, v2, method=COSINE) # COSINE | EUCLIDEAN | MANHATTAN | HAMMING</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">features(vectors=[]) # Returns the set() of unique features.</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">centroid(vectors=[]) # Returns the mean Vector.</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">cluster(method=KMEANS, vectors=[], distance=COSINE, **kwargs)</pre><ul>
<li><span class="inline_code">normalize()</span> and <span class="inline_code">tfidf()</span> modify and return the vectors in-place for performance.&nbsp;</li>
<li><span class="inline_code">distance()</span>&nbsp;can also take a user-defined function as&nbsp;<span class="inline_code">method</span>&nbsp;that returns <span class="inline_code">0.0-1.0</span>.<br />Cosine similarity for two vectors <span class="inline_code">v1</span> and <span class="inline_code">v2</span> = <span class="inline_code">1</span> <span class="inline_code">-</span> <span class="inline_code">distance(v1,</span> <span class="inline_code">v2)</span>.</li>
<li><span class="inline_code">cluster()</span> takes optional parameters <span class="inline_code">k</span>, <span class="inline_code">iterations</span>, <span class="inline_code">seed</span> and <span class="inline_code">p</span> (see <a class="link-maintenance" href="#kmeans">clustering</a>).</li>
</ul>
<div>Here is a low-level approach (cf. what&nbsp;<span class="inline_code">Model</span> does under the hood) for calculating cosine similarity:</div>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.vector import Vector, distance
&gt;&gt;&gt;
&gt;&gt;&gt; v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
&gt;&gt;&gt; v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
&gt;&gt;&gt; print 1 - distance(v1, v2)
0.33</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="model"></a>Model</h2>
<p>A <span class="inline_code">Model</span> (previously <span class="inline_code">Corpus</span>) or <em>vector space model</em> is a collection of <span class="inline_code">Document</span> objects. Each <span class="inline_code">Document.vector</span> in a model is a dictionary of features (e.g., words) and feature weights (e.g., word count). Essentially, a model is then a sparse matrix with documents as rows, features as columns, and feature weights as cells. Mathematical functions can be used on the matrix. For example, to compute how similar two documents are, based on the features they have in common.</p>
<p>A <span class="inline_code">Model</span> has a weighting scheme that determines how the feature weights in each document vector are calculated. The <span class="inline_code">weight</span> parameter can be set to <span class="inline_code">TF</span> (relative term frequency), <span class="inline_code">TFIDF</span> (term frequency vs. document frequency),&nbsp;<span class="inline_code">IG</span> (information gain), <span class="inline_code">BINARY</span> (<span class="inline_code">0</span> or <span class="inline_code">1</span>) or <span class="inline_code">None</span> (original weights).</p>
<pre class="brush:python; gutter:false; light:true;">model = Model(documents=[], weight=TFIDF)</pre><pre class="brush:python; gutter:false; light:true;">model = Model.load(path) # Imports file created with Model.save().</pre><pre class="brush:python; gutter:false; light:true;">model.documents # List of Documents (read-only).
model.document(name) # Yields document with given name (unique).
model.inverted # Dictionary of (word, set(documents))-items.
model.vector # Dictionary of (word, 0.0)-items.
model.vectors # List of all Document vectors.
model.features # List of all Document.vector.keys().
model.classes # List of all Document.type values.
model.weight # Feature weights: TF | TFIDF | IG | BINARY | None
model.density # Overall word coverage (0.0-1.0).
model.lsa # Concept space, set with Model.reduce().</pre><pre class="brush:python; gutter:false; light:true;">model.append(document)
model.remove(document)
model.extend(documents)
model.clear()</pre><pre class="brush:python; gutter:false; light:true;">model.df(word) # Document frequency (0.0-1.0).
model.idf(word) # log(1/df)
model.similarity(document1, document2) # Cosine similarity (0.0-1.0).
model.neighbors(document, top=10) # (similarity, document) list.
model.search(words=[], **kwargs) # (similarity, document) list.
model.distance(document1, document2, method=COSINE) # COSINE | EUCLIDEAN | MANHATTAN
model.cluster(documents=ALL, method=KMEANS) # KMEANS | HIERARCHICAL
model.reduce(dimensions=L2) # L1 | L2 | TOP300 | int</pre><pre class="brush:python; gutter:false; light:true;">model.infogain(word) # Entropy (≈predictability).
model.filter(features=[], documents=[]) # Model with selected features.
model.feature_selection(top=100, method=IG, threshold=0.0) # Informative features.
</pre><pre class="brush:python; gutter:false; light:true;">model.sets(threshold=0.5) # Frequent word sets.</pre><pre class="brush:python; gutter:false; light:true;">model.save(path, update=False)
model.export(path, format=ORANGE) # ORANGE | WEKA </pre><ul>
<li><span class="inline_code">Model.df()</span> returns document frequency of a feature, as a value between <span class="inline_code">0.0-1.0</span>.</li>
<li><span class="inline_code">Model.idf()</span> returns the inverse document frequency (or <span class="inline_code">None</span> if a feature is not in the model).</li>
<li><span class="inline_code">Model.similarity()</span> returns the cosine similarity of two <span class="inline_code">Documents</span> between <span class="inline_code">0.0-1.0</span>.<span class="inline_code"><br /></span></li>
<li><span class="inline_code">Model.neighbors()</span> returns a sorted list of <span class="inline_code">(similarity, Document)</span>-tuples.</li>
<li><span class="inline_code">Model.search()</span> returns a sorted list of <span class="inline_code">(similarity, Document)</span>-tuples, based on a list of query words. A <span class="inline_code">Document</span> is created on-the-fly for the given list, using the given optional arguments.</li>
<li><span class="inline_code">Model.sets()</span> returns a dictionary of <span class="inline_code">(set(words), frequency)</span>-items of word combinations and their relative frequency above the given threshold (<span class="inline_code">0.0-1.0</span>).</li>
</ul>
<p>The following example demonstrates the tf-idf weighting scheme and cosine similarity:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.vector import Document, Model, TFIDF
&gt;&gt;&gt;
&gt;&gt;&gt; d1 = Document('A tiger is a big yellow cat with stripes.', type='tiger')
&gt;&gt;&gt; d2 = Document('A lion is a big yellow cat with manes.', type='lion',)
&gt;&gt;&gt; d3 = Document('An elephant is a big grey animal with a slurf.', type='elephant')
&gt;&gt;&gt;
&gt;&gt;&gt; print d1.vector
&gt;&gt;&gt;
&gt;&gt;&gt; m = Model(documents=[d1, d2, d3], weight=TFIDF)
&gt;&gt;&gt;
&gt;&gt;&gt; print d1.vector
&gt;&gt;&gt; print
&gt;&gt;&gt; print m.similarity(d1, d2) # tiger vs. lion
&gt;&gt;&gt; print m.similarity(d1, d3) # tiger vs. elephant
{u'tiger': 0.25, u'stripes': 0.25, u'yellow': 0.25, u'cat': 0.25} # TF
{u'tiger': 0.27, u'stripes': 0.27, u'yellow': 0.10, u'cat': 0.10} # TFIDF
0.12
0.0
</pre></div>
<p>In this example we created documents with descriptions of a <span class="smallcaps">tiger</span>, a <span class="smallcaps">lion</span> and an <span class="smallcaps">elephant</span>. When we print the <span class="smallcaps">tiger</span> vector, all the feature weights are equal (<span class="inline_code">TF</span>). But when we group the documents in a model, the weight of <span class="smallcaps">tiger</span> features <em>yellow</em> and <em>cat</em> diminishes, because these features also appear in <span class="smallcaps">lion</span> (<span class="inline_code">TFIDF</span>).</p>
<p>We then compare <span class="smallcaps">tiger</span> with <span class="smallcaps">lion</span> and <span class="smallcaps">elephant</span> and, as it turns out, <span class="smallcaps">tiger</span> is more similar to <span class="smallcaps">lion</span>. The similarity is quite low (12%), because in this example 2/3 of all documents (<span class="smallcaps">tiger</span> and <span class="smallcaps">lion</span>) share most of their features. If we continue to add, say, 10,000 documents for other animals (e.g. "A squirrel is a small rodent with a tail.") the similarity will rise, since the difference in word usage for different types of animals will stand out more clearly.</p>
<p>If we had multiple descriptions for each animal (each a <span class="inline_code">Document</span> with a <span class="inline_code">type</span>), we could use <span class="inline_code">Model.neighbors()</span> to retrieve a list of the top most similar documents for a given (unknown) document, and then check which type in the list predominates (= a majority vote). This is essentially what a <span class="inline_code">KNN</span> <a class="link-maintenance" href="#classifier">classifier</a> does.</p>
<h3>Model cache</h3>
<p>The calculations in <span class="inline_code">Model.df()</span> (document frequency), <span class="inline_code">Model.similarity()</span> (cosine similarity) and <span class="inline_code">Model.infogain()</span> (information gain) are cached for faster performance.</p>
<p>Note that whenever a document is added to or removed from a model with a <span class="inline_code">TFIDF</span> or <span class="inline_code">IG</span> weighting scheme, the cache is cleared, since new features will change the weights. So if you need to add a lot of documents (e.g., 10,000+), use <span class="inline_code">Model.extend()</span> with a list of documents for faster performance.</p>
<h3>Model import &amp; export</h3>
<p><span class="inline_code">Model.save()</span> exports the model as a binary file using the Python <span class="inline_code">cPickle</span> module, including the cache. With <span class="inline_code">Model.save(update=True)</span>, all possible vectors and similarities will be calculated and cached before saving. The classmethod <span class="inline_code">Model.load()</span> returns a <span class="inline_code">Model</span> from the given file created with <span class="inline_code">Model.save()</span>.</p>
<p><span class="inline_code">Model.export()</span> exports a file that can be used with popular machine learning software. With <span class="inline_code">ORANGE</span>, it generates a tab-separated text file for <a href="http://orange.biolab.si/">Orange</a>. With <span class="inline_code">WEKA</span>, it generates an ARFF text file for <a href="http://www.cs.waikato.ac.nz/ml/weka/">Weka</a>.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="lsa"></a>Latent semantic analysis</h2>
<p>Latent Semantic Analysis (LSA) is a statistical technique based on singular value decomposition (SVD) <span class="small"><a class="noexternal" href="http://en.wikipedia.org/wiki/Singular_value_decomposition" target="_blank">[1]</a> <a class="noexternal" href="http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html" target="_blank">[2]</a></span>. It groups related features in the model into concepts (e.g., <em>purr</em> + <em>fur</em> + <em>claw</em> = <span class="smallcaps">feline</span> concept). This is called dimensionality reduction. Each document in the model then gets a concept vector, a compressed approximation of the original vector that may be faster for cosine similarity, clustering and classification.</p>
<p>SVD requires the Python <a href="http://numpy.scipy.org/" target="_blank">NumPy</a> package (installed by default on Mac OS X). Given a matrix of documents ×&nbsp;features, it yields a matrix <span class="inline_code">U</span> with documents ×&nbsp;concepts, a diagonal matrix <span class="inline_code">Σ</span> with singular values, and a matrix <span class="inline_code">Vt</span> with concepts ×&nbsp;features.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">from numpy.linalg import svd
from numpy import dot, diag
u, sigma, vt = svd(matrix, full_matrices=False)
for i in range(-k, 0):
sigma[i] = 0 # Reduce k smallest singular values.
matrix = dot(u, dot(diag(sigma), vt))</pre></div>
<p><span class="small"><span style="text-decoration: underline;">Reference</span>: Wilk J. (2007). http://blog.josephwilk.net/projects/latent-semantic-analysis-in-python.html</span></p>
<div class="example"><br />The following figure illustrates LSA for a&nbsp;document of words that commonly occur after the word <em>nice</em>:</div>
<table class="border" border="0">
<tbody>
<tr>
<td>
<p><br /><img style="display: block; margin-left: auto; margin-right: auto;" src="../g/pattern-vector-lsa1.jpg" alt="" /></p>
</td>
</tr>
</tbody>
</table>
<h3>LSA concept space</h3>
<p>The <span class="inline_code">Model.reduce()</span> method calculates SVD and stores the concept space in <span class="inline_code">Model.lsa</span>. The optional&nbsp;<span class="inline_code">dimensions</span> parameter defines the number of dimensions in the concept space: <span class="inline_code">TOP300</span>, <span class="inline_code">L1</span>, <span class="inline_code">L2</span>&nbsp;(default), an&nbsp;<span class="inline_code">int</span> or a function. There is no universal optimal value: too many dimensions may result in noise, while too few may remove useful information.</p>
<p>When <span class="inline_code">Model.lsa</span> is set,&nbsp;<span class="inline_code">Model.similarity()</span>, <span class="inline_code">neighbors()</span>, <span class="inline_code">search()</span> and <span class="inline_code">cluster()</span>&nbsp;will subsequently compute in LSA concept space. To undo the reduction, set <span class="inline_code">Model.lsa</span> to <span class="inline_code">None</span>. Adding or removing documents in the model will also undo the reduction.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">lsa = Model.reduce(dimensions=L2)</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">lsa = Model.lsa</pre><pre class="brush:python; gutter:false; light:true;">lsa = LSA(model, k=L2)</pre><pre class="brush:python; gutter:false; light:true;">lsa.model # Parent Model.
lsa.features # List of features, same as Model.features.
lsa.concepts # List of concepts, each a {feature: weight} dict.
lsa.vectors # {Document.id: {concept_index: weight}}</pre><pre class="brush:python; gutter:false; light:true;">lsa.transform(document)</pre><table class="border">
<tbody>
<tr>
<td class="smallcaps">Dimensions</td>
<td class="smallcaps">Description</td>
</tr>
<tr>
<td class="inline_code">TOP300</td>
<td>Keep the top 300 dimensions (rule of thumb).</td>
</tr>
<tr>
<td class="inline_code">L1</td>
<td>L1-norm of the singular values as the number of dimensions to remove.</td>
</tr>
<tr>
<td class="inline_code">L2</td>
<td>L2-norm of the singular values as the number of dimensions to remove.</td>
</tr>
<tr>
<td class="inline_code">int</td>
<td>An <span class="inline_code">int</span> that is the number of dimensions to remove.</td>
</tr>
<tr>
<td class="inline_code">function</td>
<td>A function that takes the list of singular values and returns an int.</td>
</tr>
</tbody>
</table>
<p><span class="inline_code">LSA.transform()</span> takes a <span class="inline_code">Document</span> and returns its <span class="inline_code">Vector</span> in concept space. This is useful for documents that are not part of the model (see also <span class="inline_code">Classifier.classify()</span>).</p>
<p>The following example demonstrates how related features are grouped after LSA:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.vector import Document, Model
&gt;&gt;&gt;
&gt;&gt;&gt; d1 = Document('The cat purrs.', name='cat1')
&gt;&gt;&gt; d2 = Document('Curiosity killed the cat.', name='cat2')
&gt;&gt;&gt; d3 = Document('The dog wags his tail.', name='dog1')
&gt;&gt;&gt; d4 = Document('The dog is happy.', name='dog2')
&gt;&gt;&gt;
&gt;&gt;&gt; m = Model([d1, d2, d3, d4])
&gt;&gt;&gt; m.reduce(2)
&gt;&gt;&gt;
&gt;&gt;&gt; for d in m.documents:
&gt;&gt;&gt; print
&gt;&gt;&gt; print d.name
&gt;&gt;&gt; for concept, w1 in m.lsa.vectors[d.id].items():
&gt;&gt;&gt; for feature, w2 in m.lsa.concepts[concept].items():
&gt;&gt;&gt; if w1 != 0 and w2 != 0:
&gt;&gt;&gt; print (feature, w1 * w2)
</pre></div>
<p>The model is reduced to two dimensions. So there are two concepts in the concept space. Each document has a concept vector with weights for each concept. As illustrated below, cat features have been grouped together and dog features have been grouped together.</p>
<table class="border">
<tbody>
<tr>
<td style="width: 12%; text-align: center;"><span class="smallcaps">concept</span></td>
<td style="text-align: center;"><span>cat</span></td>
<td style="text-align: center;"><span>curiosity</span></td>
<td style="text-align: center;"><span>dog</span></td>
<td style="text-align: center;"><span>happy</span></td>
<td style="text-align: center;"><span>killed</span></td>
<td style="text-align: center;"><span>purrs</span></td>
<td style="text-align: center;"><span>tail</span></td>
<td style="text-align: center;"><span>wags</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">0</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">+0.52</span></td>
<td style="text-align: center;"><span class="inline_code">+0.78</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">+0.26</span></td>
<td style="text-align: center;"><span class="inline_code">+0.26</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">1</span></td>
<td style="text-align: center;"><span class="inline_code">-0.52</span></td>
<td style="text-align: center;"><span class="inline_code">-0.26</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">-0.26</span></td>
<td style="text-align: center;"><span class="inline_code">-0.78</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
</tr>
</tbody>
</table>
<table class="border">
<tbody>
<tr>
<td style="width: 12%; text-align: center;"><span class="smallcaps">concept</span></td>
<td style="text-align: center;"><span><span class="inline_code">d1</span> (cat1)</span></td>
<td style="text-align: center;"><span><span class="inline_code">d2</span> (cat2)</span></td>
<td style="text-align: center;"><span><span class="inline_code">d3</span> (dog1)</span></td>
<td style="text-align: center;"><span><span class="inline_code">d4</span> (dog2)</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">0</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">+0.45</span></td>
<td style="text-align: center;"><span class="inline_code">+0.90</span></td>
</tr>
<tr>
<td style="text-align: center;"><span class="inline_code">1</span></td>
<td style="text-align: center;"><span class="inline_code">-0.90</span></td>
<td style="text-align: center;"><span class="inline_code">-0.45</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
<td style="text-align: center;"><span class="inline_code">&nbsp;0.00</span></td>
</tr>
</tbody>
</table>
<p>Dimensionality reduction is useful with <span class="inline_code">Model.cluster()</span>. Clustering algorithms are exponentially slow (i.e., 3 nested <span class="inline_code">for</span>-loops). Clustering a model of 1,000 documents with 1,000 features takes a couple of minutes. However, it takes a couple of seconds to reduce this model to concept vectors with 100 features, after which <em>k</em>-means clustering also runs in a couple of seconds.&nbsp;Note that document vectors are stored in sparse format (i.e., features with weight <span class="inline_code">0.0</span> are omitted), so it is often not necessary to reduce the model. Even if the model has 1,000 features, each document might have no more than 5-10 features. To get an idea of the average document vector length:</p>
<p><span class="inline_code">sum(len(d.vector) for d in model.documents) / float(len(model)) </span></p>
<p>&nbsp;</p>
<hr />
<h2><a name="cluster"></a>Clustering</h2>
<p>Clustering is an unsupervised machine learning method that can be used to partition a set of unlabeled documents (i.e., <span class="inline_code">Document</span> objects without a <span class="inline_code">type</span>). Since the label (class, type, category) of a document is not known, clustering will attempt to create clusters (categories) of similar documents by measuring the distance between the document vectors. The optimal solution is then a set of <em>dense</em> clusters, where each cluster is made up of documents with the smallest possible distance between them.</p>
<p>Say we have a number of 2D points with&nbsp;coordinates <span class="inline_code">x</span> and <span class="inline_code">y</span> (horizontal and vertical position). Some points will be further apart than others. The figure below illustrates how we can partition the points by measuring their distance to two centroids. More centroids create more clusters. The principle holds for 3D points with&nbsp;<span class="inline_code">x</span>, <span class="inline_code">y</span>&nbsp;and&nbsp;<span class="inline_code">z</span>&nbsp;coordinates, or any n-D points&nbsp;(<span class="inline_code">x</span>, <span class="inline_code">y</span>, <span class="inline_code">z</span>, <span class="inline_code">...</span>, <span class="inline_code">n</span>). This is how the <em>k</em>-means clustering algorithm works. A <span class="inline_code">Document.vector</span> is an n-dimensional point. Instead of coordinates&nbsp;<span class="inline_code">x</span> and <span class="inline_code">y</span> it has <span class="inline_code">n</span> features (words) and feature weights. We can calculate the distance between document vectors with cosine similarity.</p>
<table class="border">
<tbody>
<tr>
<td style="text-align: center;"><img style="display: block; margin-left: auto; margin-right: auto;" src="../g/pattern-vector-cluster1.jpg" alt="" width="249" height="125" /><span class="smallcaps">random points in 2D</span></td>
<td style="text-align: center;"><img style="display: block; margin-left: auto; margin-right: auto;" src="../g/pattern-vector-cluster2.jpg" alt="" width="249" height="125" /><span class="smallcaps">points by distance to centroid</span></td>
</tr>
</tbody>
</table>
<p>The <span class="inline_code">Model.cluster()</span> method returns a list of clusters using the <span class="inline_code">KMEANS</span> or the <span class="inline_code">HIERARCHICAL</span> algorithm. The optional <span class="inline_code">distance</span> parameter can be <span class="inline_code">COSINE</span> (default), <span class="inline_code">EUCLIDEAN</span>, <span class="inline_code">MANHATTAN</span> or <span class="inline_code">HAMMING</span>. An optional <span class="inline_code">documents</span>&nbsp;parameter can be a selective list of documents in the model to cluster.</p>
<pre class="brush:python; gutter:false; light:true;">clusters = Model.cluster(method=KMEANS, k=10, iterations=10, distance=COSINE)</pre><pre class="brush:python; gutter:false; light:true;">clusters = Model.cluster(method=HIERARCHICAL, k=1, iterations=1000, distance=COSINE)</pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.vector import Document, Model, HIERARCHICAL
&gt;&gt;&gt;
&gt;&gt;&gt; d1 = Document('Cats are independent pets.', name='cat')
&gt;&gt;&gt; d2 = Document('Dogs are trustworthy pets.', name='dog')
&gt;&gt;&gt; d3 = Document('Boxes are made of cardboard.', name='box')
&gt;&gt;&gt;
&gt;&gt;&gt; m = Model((d1, d2, d3))
&gt;&gt;&gt; print m.cluster(method=HIERARCHICAL, k=2)
Cluster([
Document(id=3, name='box'),
Cluster([
Document(id=2, name='dog'),
Document(id=1, name='cat')
])
])</pre></div>
<h3><em><a name="kmeans"></a>k</em>-means clustering</h3>
<p>The <em>k</em>-means clustering algorithm partitions a set of unlabeled documents into <em>k</em> clusters, using <em>k</em> random centroids. It returns a list containing&nbsp;<em>k</em> lists of similar documents.&nbsp;</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">Model.cluster(method=KMEANS, k=10, iterations=10, distance=COSINE, seed=RANDOM, p=0.8)</pre><p>The advantage of <em>k</em>-means is that it is fast. The drawback is that an optimal solution is not guaranteed, since the position of the centroids is random.&nbsp;Each iteration, the algorithm will swap documents between clusters to create denser clusters.&nbsp;</p>
<p>The optional <span class="inline_code">seed</span>&nbsp;parameter can be <span class="inline_code">RANDOM</span> or&nbsp;<span class="inline_code">KMPP</span>. The <span class="inline_code">KMPP</span> or&nbsp;<em>k</em>-means++ initialization algorithm can be used to find better centroids. In many cases this is also faster. The optional parameter <span class="inline_code">p</span>&nbsp;sets the "relaxation" of the <em>k</em>-means algorithm. Relaxation is based on a mathematical trick called triangle inequality, where <span class="inline_code">p=0.5</span> is stable but slow and <span class="inline_code">p=1.0</span> is prone to errors but faster, especially for higher <span class="inline_code">k</span> and document vectors with many features (i.e., higher dimensionality).</p>
<p><span class="small"><span style="text-decoration: underline;">References</span>: <br />Arthur, D. (2007). <em>k-means++: the advantages of careful seeding. </em>SODA'07 Proceedings.<br />Elkan, C. (2003). <em>Using the Triangle Inequality to Accelerate k-Means. </em>ICML'03 Proceedings.</span></p>
<h3><a name="hierarchical"></a>Hierarchical clustering</h3>
<p>The hierarchical clustering algorithm returns a tree of nested clusters. The top level item is a <span class="inline_code">Cluster</span>, a mixed list of&nbsp;<span class="inline_code">Document</span> and (nested)&nbsp;<span class="inline_code">Cluster</span> objects.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">Model.cluster(method=HIERARCHICAL, k=1, iterations=1000, distance=COSINE)</pre><p>The advantage of hierarchical clustering is that the optimal solution is guaranteed. Each iteration, the algorithm will cluster the two nearest documents. The drawback is that it is slow.</p>
<p>A <span class="inline_code">Cluster</span> is a list of <span class="inline_code">Document</span>&nbsp;and <span class="inline_code">Cluster</span>&nbsp;objects, with some additional properties:</p>
<pre class="brush:python; gutter:false; light:true;">cluster = Cluster([])</pre><pre class="brush:python; gutter:false; light:true;">cluster.depth # Returns the maximum depth of nested clusters.
cluster.flatten(depth=1000) # Returns a flat list, down to the given depth.
cluster.traverse(visit=lambda cluster: None) </pre><div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.vector import Cluster
&gt;&gt;&gt;
&gt;&gt;&gt; cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
&gt;&gt;&gt; print cluster.depth
&gt;&gt;&gt; print cluster.flatten(1)
2
[1, 2, Cluster([3, 4])] </pre></div>
<p class="small">Note: the maximum recursion depth in Python is 1,000. For deeper clusters, raise <span class="inline_code">sys.setrecursionlimit()</span>.</p>
<h3>Centroid</h3>
<p>The <span class="inline_code">centroid()</span> function takes a <span class="inline_code">Cluster</span>, or a list of <span class="inline_code">Cluster</span>, <span class="inline_code">Document</span> and <span class="inline_code">Vector</span> objects, and returns the mean <span class="inline_code">Vector</span>. The <span class="inline_code">distance()</span> function returns the distance between two vectors. A common problem is that a cluster has no meaningful descriptive name. One solution is to calculate its centroid, and use the <span class="inline_code">Document.type</span> of the document vector(s) nearest to the centroid.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">centroid(vectors=[]) # Returns the mean Vector. </pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">distance(v1, v2, method=COSINE) # COSINE | EUCLIDEAN | MANHATTAN | HAMMING</pre><p>&nbsp;</p>
<hr />
<h2><a name="classification"></a>Classification</h2>
<p>Classification can be used to predict the label of an unlabeled document. More specifically, classification is a supervised machine learning method that uses labeled documents (i.e., <span class="inline_code">Document</span> objects with a <span class="inline_code">type</span>) as training examples to statistically predict the label (class, type) of new documents, based on their similarity to the training examples using a distance metric (e.g., cosine similarity). A <span class="inline_code">Document</span> is a bag-of-words representation of a text, i.e., unordered words + word count. The <span class="inline_code">Document.vector</span> maps the words (or features) to their weight (absolute or relative word count, tf-idf, ...). The weight of a word represents its relevancy in the text. So we can compare how similar two documents are by measuring if they have relevant words in common. Given an unlabeled document, a classifier yields the label of the most similar document(s) in its training set. This implies that a larger training set with more features (and fewer labels) gives better performance.</p>
<p>For example, if we have a corpus of product reviews (<em>training data</em>) for which the star rating of each product review is known (<em>labels</em>, e.g., ★★★☆☆ = 3), we can use it to predict the star rating of other reviews, based on common words (<em>features</em>) in the text. We could represent each review as a vector of adjectives (e.g., good, bad, awesome, awful, ...) since positive reviews (good, awesome) will most likely contain different adjectives than negative reviews (bad, awful).</p>
<p>The pattern.vector module implements four classification algorithms:</p>
<ul>
<li><span class="inline_code">NB</span>: <a href="https://en.wikipedia.org/wiki/Naive_Bayes_classifier" target="_blank">Naive Bayes</a>, based on the probability that a feature occurs in a class.</li>
<li><span class="inline_code">KNN</span>: <a href="https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm" target="_blank"><em>k</em>-nearest neighbor</a>, based on the <em>k</em> most similar documents in the training set.</li>
<li><span class="inline_code">SLP</span>: <a href="http://en.wikipedia.org/wiki/Perceptron" target="_blank">single-layer averaged perceptron</a>, based on an artificial neural network.</li>
<li><span class="inline_code">SVM</span>: <a href="https://en.wikipedia.org/wiki/Support_vector_machine" target="_blank">support vector machine</a>, based on a representation of the documents in a high-dimensional space separated by hyperplanes (see further).</li>
</ul>
<pre class="brush:python; gutter:false; light:true;">classifier = NB(train=[], baseline=MAJORITY, method=MULTINOMIAL, alpha=0.0001)</pre><pre class="brush:python; gutter:false; light:true;">classifier = KNN(train=[], baseline=MAJORITY, k=10, distance=COSINE)</pre><pre class="brush:python; gutter:false; light:true;">classifier = SLP(train=[], baseline=MAJORITY, iterations=1)</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">classifier = SVM(train=[], type=CLASSIFICATION, kernel=LINEAR)</pre><h3>Classifier</h3>
<p>The <span class="inline_code">NB</span>, <span class="inline_code">KNN</span>, <span class="inline_code">SLP</span> and <span class="inline_code">SVM</span> classifiers inherit from the <span class="inline_code">Classifier</span> base class:</p>
<pre class="brush:python; gutter:false; light:true;">classifier = Classifier(train=[], baseline=MAJORITY)</pre><pre class="brush:python; gutter:false; light:true;">classifier = Classifier.load(path)</pre><pre class="brush:python; gutter:false; light:true;">classifier.features # List of trained features (words).
classifier.classes # List of trained class labels.
classifier.binary # True if Classifier.classes == [True, False] or [0, 1].
classifier.distribution # Dictionary of (label, frequency)-items.
classifier.baseline # Default predicted class (most frequent or user-given).
classifier.majority # Most frequent class label.
classifier.minority # Least frequent class label.
classifier.skewness # 0.0 if the classes are evenly distributed.</pre><pre class="brush:python; gutter:false; light:true;">classifier.train(document, type=None)
classifier.classify(document, discrete=True)
</pre><pre class="brush:python; gutter:false; light:true;">classifier.confusion_matrix(documents=[])
classifier.test(documents=[], target=None)
classifier.auc(documents=[], k=10)
</pre><pre class="brush:python; gutter:false; light:true;">classifier.finalize() # Trains + removes training data from memory. </pre><pre class="brush:python; gutter:false; light:true;">classifier.save(path) # gzipped pickle file, load with Classifier.load().</pre><ul>
<li><span class="inline_code">Classifier.train()</span> trains the classifier with the given document and type (= class label).<br />A document can be a <span class="inline_code">Document</span>, <span class="inline_code">Vector</span>, <span class="inline_code">dict</span>, or a list or string of words (features).<br />If no <span class="inline_code">type</span> is given, <span class="inline_code">Document.type</span> will be used instead.<br />You can also use <span class="inline_code">Classifier(train=[document1</span><span class="inline_code">,</span> <span class="inline_code">document2</span><span class="inline_code">,</span> <span class="inline_code">...])</span> with a list or a <span class="inline_code">Model</span>.</li>
<li><span class="inline_code">Classifier.classify()</span> returns the type with the highest probability for the given document.<br />If <span class="inline_code">discrete=False</span>, returns a dictionary of (<span class="inline_code">class</span>, <span class="inline_code">probability</span>)-items.<br />If the classifier is trained on an LSA model, you must supply the output of <span class="inline_code">Model.lsa.transform()</span>.</li>
<li><span class="inline_code">Classifier.test()</span> returns an <span class="inline_code">(accuracy,</span> <span class="inline_code">precision,</span> <span class="inline_code">recall,</span> <span class="inline_code">F1-score)</span>-tuple.<br />The given test data can be a list of documents, <span class="inline_code">(document,</span> <span class="inline_code">type)</span>-tuples or a <span class="inline_code">Model</span>.</li>
</ul>
<h3>Training a classifier</h3>
<p>Say we have a corpus of 1,000 short movie reviews (<a class="link-maintenance" href="http://www.clips.ua.ac.be/media/reviews.csv.zip">reviews.csv.zip</a>), each with a star rating given by the reviewer or customer. The corpus contains such instances as:</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Review</span></td>
<td style="text-align: center;"><span class="smallcaps">Rating</span></td>
</tr>
<tr>
<td><em>Amazing film!</em></td>
<td style="text-align: center;"><span class="inline_code">★★★★★</span></td>
</tr>
<tr>
<td><em>Pretty darn good</em></td>
<td style="text-align: center;"><span class="inline_code">★★★★☆</span></td>
</tr>
<tr>
<td><em>Rather disappointing</em></td>
<td style="text-align: center;"><span class="inline_code">★★☆☆☆</span></td>
</tr>
<tr>
<td><em>How can anyone watch this drivel?</em></td>
<td style="text-align: center;"><span class="inline_code">☆☆☆☆☆</span></td>
</tr>
</tbody>
</table>
<p>We can use the corpus to train a classifier that predicts the star rating of other reviews, based on word similarity. By creating a <span class="inline_code">Document</span> for each review we have control over what words (features) are included or not (e.g., stopwords). We will use a Naive Bayes (<span class="inline_code">NB</span>) classifier, but the examples will also work with <span class="inline_code">KNN</span> and <span class="inline_code">SVM</span>, since all classifiers inherit from <span class="inline_code">Classifier</span>.</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.vector import Document, NB
&gt;&gt;&gt; from pattern.db import csv
&gt;&gt;&gt;
&gt;&gt;&gt; nb = NB()
&gt;&gt;&gt; for review, rating in csv('reviews.csv'):
&gt;&gt;&gt;     v = Document(review, type=int(rating), stopwords=True)
&gt;&gt;&gt;     nb.train(v)
&gt;&gt;&gt;
&gt;&gt;&gt; print nb.classes
&gt;&gt;&gt; print nb.classify(Document('A good movie!'))
[0, 1, 2, 3, 4, 5]
4 </pre></div>
<p>The review <em>"A good movie!"</em> is classified as ★★★★☆ because, based on the training data, the classifier learned that <em>good</em> is often related to higher star ratings.</p>
<h3>Testing a classifier</h3>
<p>How accurate is the classifier? Naive Bayes can be quite effective despite its simple implementation. In this example it has an accuracy of 60%. Given a set of testing data, <span class="inline_code">NB.test()</span> returns an <span class="inline_code">(accuracy,</span> <span class="inline_code">precision,</span> <span class="inline_code">recall,</span> <span class="inline_code">F1-score)</span>-tuple with values between <span class="inline_code">0.0</span> and <span class="inline_code">1.0</span>:</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">NB.test(documents=[], target=None) # Returns (accuracy, precision, recall, F1).</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; data = csv('reviews.csv')
&gt;&gt;&gt; data = [(review, int(rating)) for review, rating in data]
&gt;&gt;&gt; data = [Document(review, type=rating, stopwords=True) for review, rating in data]
&gt;&gt;&gt;
&gt;&gt;&gt; nb = NB(train=data[:500])
&gt;&gt;&gt;
&gt;&gt;&gt; accuracy, precision, recall, f1 = nb.test(data[500:])
&gt;&gt;&gt; print accuracy
0.60</pre></div>
<p>Note how we used 1/2 of the data for training and reserved the other 1/2 of the data for testing.</p>
<p class="smallcaps"><br />Binary classification</p>
<p>The reported accuracy (60%) is not the worst baseline. Random guessing between the six possible star ratings (0-5) has only 17% accuracy. Moreover, many errors are off by only one (e.g., predicts ★ instead of ★★ or vice versa). If we simplify the task and train a <em>binary</em> classifier that predicts either positive (<span class="inline_code">True</span> → star rating 3, 4, 5) or negative (<span class="inline_code">False</span> → star rating 0, 1, 2), accuracy increases to 68%. This is because we now have only two classes to choose from and more training data per class.</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; data = csv('reviews.csv')
&gt;&gt;&gt; data = [(review, int(rating) &gt;= 3) for review, rating in data]
&gt;&gt;&gt; data = [Document(review, type=rating, stopwords=True) for review, rating in data]
&gt;&gt;&gt;
&gt;&gt;&gt; nb = NB(train=data[:500])
&gt;&gt;&gt;
&gt;&gt;&gt; accuracy, precision, recall, f1 = nb.test(data[500:])
&gt;&gt;&gt; print accuracy
0.68</pre></div>
<p class="smallcaps"><br />Skewed data</p>
<p>The reported accuracy can be misleading. Suppose we have a classifier that <em>always</em> predicts positive (<span class="inline_code">True</span>). We evaluate it with a test set that contains 1/2 positive reviews. So accuracy is 50%. We then evaluate it with a test set that contains 9/10 positive reviews. Accuracy is now 90%. This happens if the data is skewed, i.e., when it has more instances of one class and fewer of the other.</p>
<p>A more reliable evaluation is to look at both the rate of correct predictions and incorrect predictions, per class. This information can be derived from the <em>confusion matrix</em>.</p>
<p class="smallcaps"><br />Confusion matrix</p>
<p>A <span class="inline_code">ConfusionMatrix</span> is a matrix of actual classes ×&nbsp;predicted classes, stored as a dictionary:</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">confusion = Classifier.confusion_matrix(documents=[])</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">confusion(target) # (TP, TN, FP, FN) for given class.
confusion.table # Pretty string output.</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; print nb.distribution
&gt;&gt;&gt; print nb.confusion_matrix(data[500:])
&gt;&gt;&gt; print nb.confusion_matrix(data[500:])(True) # (TP, TN, FP, FN)
{True: 373, False: 127}
{True: {True: 285, False: 94}, False: {False: 53, True: 68}}
(285, 53, 68, 94) </pre></div>
<table class="border">
<tbody>
<tr>
<td class="smallcaps" style="text-align: center;">Class</td>
<td class="smallcaps" style="text-align: center;" colspan="2">Predicted class</td>
</tr>
<tr>
<td>&nbsp;</td>
<td class="inline_code" style="text-align: center;">True</td>
<td class="inline_code" style="text-align: center;">False</td>
</tr>
<tr>
<td class="inline_code" style="text-align: center;">True</td>
<td style="text-align: center;">285</td>
<td style="text-align: center;">94</td>
</tr>
<tr>
<td class="inline_code" style="text-align: center;">False</td>
<td style="text-align: center;">68</td>
<td style="text-align: center;">53</td>
</tr>
</tbody>
</table>
<p>The class distribution shows that there are more positive reviews in the training data (373/500).</p>
<p>The confusion matrix shows that, by consequence, the classifier is good at predicting positive reviews (285/379 or 75%) but bad at predicting negative reviews (53/121 or 44%). Note how we call the <span class="inline_code">ConfusionMatrix</span> like a function. This returns a <span class="inline_code">(TP,</span> <span class="inline_code">TN,</span> <span class="inline_code">FP,</span> <span class="inline_code">FN)</span>-tuple for a given class, the amount of true positives ("hits"), true negatives ("rejects"), false positives ("errors") and false negatives ("misses").</p>
<p class="smallcaps"><br />Precision &amp; recall</p>
<p><strong>Precision</strong> is a measure of hits vs. errors. <strong>Recall</strong> is a measure of hits vs. misses. If the classifier has a low precision, negative cases are being misclassified as positive. If the classifier has a low recall, not all positive cases are being caught. F1-score is simply the harmonic mean of precision and recall.</p>
<p>Say we have an online shop that automatically highlights positive customer reviews. Negative reviews might contain profanity, so we want to focus on high precision to make sure that no swear words are highlighted. Say we hire a moderator to double-check highlighted reviews. In this case we can focus on high recall, to make sure that no positive review is overlooked. Our moderator will have to unhighlight some reviews by hand.</p>
<table class="border">
<tbody>
<tr>
<td class="smallcaps">Metric</td>
<td><span class="smallcaps">Formula</span></td>
</tr>
<tr>
<td>Accuracy</td>
<td><span class="inline_code">(TP</span> <span class="inline_code">+</span> <span class="inline_code">TN)</span> <span class="inline_code">/</span> <span class="inline_code">(TP</span> <span class="inline_code">+</span> <span class="inline_code">TN</span> <span class="inline_code">+</span> <span class="inline_code">FP</span> <span class="inline_code">+</span> <span class="inline_code">FN)</span></td>
</tr>
<tr>
<td>Precision</td>
<td><span class="inline_code">TP</span> <span class="inline_code">/</span> <span class="inline_code">(TP</span> <span class="inline_code">+</span> <span class="inline_code">FP)</span></td>
</tr>
<tr>
<td>Recall</td>
<td><span class="inline_code">TP</span> <span class="inline_code">/</span> <span class="inline_code">(TP</span> <span class="inline_code">+</span> <span class="inline_code">FN)</span></td>
</tr>
<tr>
<td>F1-score</td>
<td><span class="inline_code">2</span> <span class="inline_code">x</span> <span class="inline_code">P</span> <span class="inline_code">x</span> <span class="inline_code">R</span> <span class="inline_code">/</span> <span class="inline_code">(P</span> <span class="inline_code">+</span> <span class="inline_code">R)</span></td>
</tr>
</tbody>
</table>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; print nb.test(data[500:], target=True)
&gt;&gt;&gt; print nb.test(data[500:], target=False)
&gt;&gt;&gt; print nb.test(data[500:])
(0.676, 0.807, 0.752, 0.779) # A, P, R, F1 for predicting True.
(0.676, 0.361, 0.438, 0.396) # A, P, R, F1 for predicting False.
(0.676, 0.584, 0.595, 0.589) # A, P, R, F1 (macro-averaged).</pre></div>
<p>In summary, the 59% F1-score is a more reliable estimate than the 68% accuracy.</p>
<p class="smallcaps"><br />K-fold cross-validation</p>
<p>K-fold cross-validation performs <em>K</em> tests on a given classifier, each time partitioning the given dataset into different subsets for training and testing, and returns the average <span class="inline_code">(accuracy,</span> <span class="inline_code">precision,</span> <span class="inline_code">recall,</span> <span class="inline_code">F1,</span> <span class="inline_code">stdev)</span>. This is more reliable (= generalized) than always using the same training data.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">kfoldcv(Classifier, documents=[], folds=10, target=None)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.vector import NB, Document, kfoldcv
&gt;&gt;&gt; from pattern.db import csv
&gt;&gt;&gt;
&gt;&gt;&gt; data = csv('reviews.csv')
&gt;&gt;&gt; data = [(review, int(rating) &gt;= 3) for review, rating in data]
&gt;&gt;&gt; data = [Document(review, type=rating, stopwords=True) for review, rating in data]
&gt;&gt;&gt;
&gt;&gt;&gt; print kfoldcv(NB, data, folds=10)
(0.678, 0.563, 0.568, 0.565, 0.034) </pre></div>
<p>Note that <span class="inline_code">kfoldcv()</span> takes any parameters of the given <span class="inline_code">Classifier</span> as optional parameters:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; print kfoldcv(KNN, data, folds=10, k=3, distance=EUCLIDEAN)</pre></div>
<p>As it turns out, our Naive Bayes classifier is not that accurate: 57% F1-score. We will need more training data and/or be more selective about our data. How about we just take the adjectives and exclamation marks in each review instead of the whole text?</p>
<p>&nbsp;</p>
<hr />
<h3><a name="feature-selection"></a>Feature selection</h3>
<p>The performance of a classifier relies on the availability of training data, and the quality of each document in the training data. The <span class="inline_code">Document.vector</span> may contain redundant or irrelevant features that reduce performance, or it may be missing features. Useful techniques that may increase performance include:</p>
<ul>
<li>Filter out noise. Raise the word count threshold with <span class="inline_code">Document(threshold=0)</span>.</li>
<li>Use <a class="link-maintenance" href="pattern-en.html#parser">part-of-speech tagging</a> to select specific types of words (e.g., adjectives, punctuation, ...)</li>
<li>Lemmatize features (<em>purred</em>&nbsp;→ <em>purr</em>) with <a class="link-maintenance" href="pattern-en.html#parser">pattern.en</a>'s <span class="inline_code">parse(lemmata=True)</span>.</li>
<li>Use <a class="link-maintenance" href="pattern-en.html#ngram">ngrams</a> or <span class="inline_code">chngrams()</span> as features.</li>
</ul>
<p>Note that you can pass a custom dictionary of <span class="inline_code">(feature,</span> <span class="inline_code">weight)</span>-items to the <span class="inline_code">Document()</span> constructor, instead of a string. You can also pass dictionaries directly to <span class="inline_code">Classifier.train()</span>.</p>
<p>The following example improves the accuracy of our movie review classifier from 57% to 60% by selecting lemmatized adjectives (<span class="postag">JJ</span>), nouns (<span class="inline_code">NN</span>), verbs (<span class="postag">VB</span>) and exclamation marks from each review:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.vector import NB, kfoldcv, count
&gt;&gt;&gt; from pattern.db import csv
&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; def v(review):
&gt;&gt;&gt;     v = parsetree(review, lemmata=True)[0]
&gt;&gt;&gt;     v = [w.lemma for w in v if w.tag.startswith(('JJ', 'NN', 'VB', '!'))]
&gt;&gt;&gt;     v = count(v)
&gt;&gt;&gt;     return v
&gt;&gt;&gt;
&gt;&gt;&gt; data = csv('reviews.csv')
&gt;&gt;&gt; data = [(v(review), int(rating) &gt;= 3) for review, rating in data]
&gt;&gt;&gt;
&gt;&gt;&gt; print kfoldcv(NB, data)
(0.631, 0.588, 0.626, 0.606, 0.044) </pre></div>
<p>Features can be selected automatically using <span class="inline_code">Model.infogain(feature)</span>. Information gain is a measure of a feature's predictability for a class label (<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>). Some features will occur more frequently in documents with a certain class label (e.g., <em>awesome</em>&nbsp;→ positive reviews, <em>awful</em>&nbsp;→ negative reviews), hence they are more "informative" than features that occur in all documents, such as <em>the</em> and <em>you</em>.</p>
<p>This value is used in <span class="inline_code">Model.feature_selection()</span> to compute a sorted list of the most informative features. An optional document frequency <span class="inline_code">threshold</span> parameter (<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>) excludes features that only occur in a few documents (i.e., outliers).</p>
<p>Automatic feature selection is useful for documents with many features (e.g., 10,000). More features require more computation and can lead to <em>overfitting</em>. Overfitting means that the classifier is making assumptions based on irrelevant features (noise). It memorizes the training data instead of generalizing from trends.</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.vector import Model, Document, BINARY, TF, NB, kfoldcv
&gt;&gt;&gt; from pattern.db import csv
&gt;&gt;&gt;
&gt;&gt;&gt; data = csv('reviews.csv')
&gt;&gt;&gt; data = [(review, int(rating) &gt;= 3) for review, rating in data]
&gt;&gt;&gt; data = [Document(review, type=rating, stopwords=True) for review, rating in data]
&gt;&gt;&gt;
&gt;&gt;&gt; model = Model(documents=data, weight=TF)
&gt;&gt;&gt; model = model.filter(features=model.feature_selection(top=1000))
&gt;&gt;&gt;
&gt;&gt;&gt; print kfoldcv(NB, model)
(0.769, 0.689, 0.639, 0.662, 0.043)</pre></div>
<p>&nbsp;</p>
<hr />
<h3><a name="nb"></a>Naive Bayes</h3>
<p>The Naive Bayes classifier is based on the probability that a feature occurs in a class, independent of other features, using Bayes' theorem.</p>
<div>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">classifier = NB(train=[], baseline=MAJORITY, method=MULTINOMIAL, alpha=0.0001)</pre></div>
<p>With the <span class="inline_code">MULTINOMIAL</span> method, feature weights are used (<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>). With the <span class="inline_code">BINOMIAL</span> method, a feature is part of a document (<span class="inline_code">1</span>) or not (<span class="inline_code">0</span>). The <span class="inline_code">alpha</span> value is used to avoid a division by zero. If <span class="inline_code">NB.classify()</span>&nbsp;is unable to classify a document, it returns the&nbsp;<span class="inline_code">baseline</span> (by default, the most frequent class).</p>
<p>&nbsp;</p>
<hr />
<h3><em><a name="knn"></a>k</em>-nearest neighbor</h3>
<p>The <em>k</em>-nearest neighbor classifier is based on the <em>k</em> most similar documents in the training data, given some distance function for calculating similarity.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">classifier = KNN(train=[], baseline=MAJORITY, k=10, distance=COSINE)</pre><p class="example">The given <span class="inline_code">distance</span> can be <span class="inline_code">COSINE</span>, <span class="inline_code">EUCLIDEAN</span>, <span class="inline_code">MANHATTAN</span> or <span class="inline_code">HAMMING</span>, or a user-given function that takes two dictionaries of <span class="inline_code">(feature,</span> <span class="inline_code">weight)</span>-items and returns a value between <span class="inline_code">0.0</span> and <span class="inline_code">1.0</span>. If <span class="inline_code">KNN.classify()</span> is unable to classify a document, it returns the <span class="inline_code">baseline</span> (by default, the most frequent class).</p>
<p class="example">&nbsp;</p>
<hr />
<h3 class="example"><a name="SLP"></a>Single-layer averaged perceptron</h3>
<p class="example">The perceptron classifier is a simple artificial neural network (ANN), based on weighted connections whose weights are iteratively fine-tuned during training.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">classifier = SLP(train=[], baseline=MAJORITY, iterations=1)</pre><p class="example">Accuracy improves with more <span class="inline_code">iterations</span> (e.g., 3-4) over the training documents. Feature weights in each document are expected to be binary (<span class="inline_code">0</span> or <span class="inline_code">1</span>). If <span class="inline_code">SLP.classify()</span> is unable to classify a document, it returns the <span class="inline_code">baseline</span> (by default, the most frequent class).</p>
<p>&nbsp;</p>
<hr />
<h3><a name="svm"></a>Support vector machine</h3>
<p class="example">The SVM classifier is based on a representation of the documents in a high-dimensional space (e.g., 2D, 3D, ...) separated by hyperplanes.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">classifier = SVM(type=CLASSIFICATION, kernel=LINEAR, train=[], **kwargs)</pre><p class="example">The given <span class="inline_code">type</span> can be <span class="inline_code">CLASSIFICATION</span> or <span class="inline_code">REGRESSION</span>. <br />The given <span class="inline_code">kernel</span> can be <span class="inline_code">LINEAR</span>, <span class="inline_code">POLYNOMIAL</span> or <span class="inline_code">RADIAL</span>.</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Kernel</span></td>
<td><span class="smallcaps">Separation</span></td>
<td><span class="smallcaps">Function</span></td>
</tr>
<tr>
<td><span class="inline_code">LINEAR</span></td>
<td>straight line</td>
<td><span class="inline_code">u' * v</span></td>
</tr>
<tr>
<td><span class="inline_code">POLYNOMIAL</span></td>
<td>curved line</td>
<td><span class="inline_code">(gamma * u' * v + coeff0) ** degree</span></td>
</tr>
<tr>
<td><span class="inline_code">RADIAL</span></td>
<td>curved path</td>
<td><span class="inline_code">exp(-gamma * abs(u-v) ** 2)</span></td>
</tr>
</tbody>
</table>
<p>Overview of optional parameters:</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Parameter</span></td>
<td><span class="smallcaps">Value</span></td>
<td><span class="smallcaps">Description</span></td>
</tr>
<tr>
<td><span class="inline_code">type</span></td>
<td><span class="inline_code">CLASSIFICATION</span>, <span class="inline_code">REGRESSION</span></td>
<td><span class="inline_code">REGRESSION</span> returns a float value.</td>
</tr>
<tr>
<td><span class="inline_code">kernel</span></td>
<td><span class="inline_code">LINEAR</span>, <span class="inline_code">POLYNOMIAL</span>, <span class="inline_code">RADIAL</span></td>
<td>Kernel function used for separation.</td>
</tr>
<tr>
<td><span class="inline_code">degree</span></td>
<td><span class="inline_code">3</span></td>
<td>Used in <span class="inline_code">POLYNOMIAL</span> kernel.</td>
</tr>
<tr>
<td><span class="inline_code">gamma</span></td>
<td><span class="inline_code">1</span> <span class="inline_code">/</span> <span class="inline_code">len(SVM.features)</span></td>
<td>Used in <span class="inline_code">POLYNOMIAL</span> and <span class="inline_code">RADIAL</span> kernel.</td>
</tr>
<tr>
<td><span class="inline_code">coeff0</span></td>
<td><span class="inline_code">0</span></td>
<td>Used in <span class="inline_code">POLYNOMIAL</span> kernel.</td>
</tr>
<tr>
<td><span class="inline_code">cost</span></td>
<td><span class="inline_code">1</span></td>
<td>Soft margin for training errors.</td>
</tr>
<tr>
<td><span class="inline_code">epsilon</span></td>
<td>0.1</td>
<td>Tolerance for termination criterion.</td>
</tr>
<tr>
<td><span class="inline_code">cache</span></td>
<td>100</td>
<td>Cache memory size in MB.</td>
</tr>
<tr>
<td><span class="inline_code"><span class="inline_code">probability</span></span></td>
<td class="inline_code">False</td>
<td><span class="inline_code">CLASSIFICATION</span> yields <span class="inline_code">(weight,</span> <span class="inline_code">class)</span> values.</td>
</tr>
</tbody>
</table>
<p class="example">The SVM classifier uses kernel functions to divide the high-dimensional space. The simplest way to divide two clusters of points in 2D is a straight line (<span class="inline_code">LINEAR</span>). As illustrated below, moving the points to a higher dimension (<span class="inline_code">POLYNOMIAL</span> or <span class="inline_code">RADIAL</span>) can make separation easier (using hyperplanes). The optional <span class="inline_code">degree</span>, <span class="inline_code">gamma</span>, <span class="inline_code">coeff0</span> and <span class="inline_code">cost</span> can be used to tweak the kernel function.</p>
<table class="border">
<tbody>
<tr>
<td style="text-align: center;"><img style="display: block; margin-left: auto; margin-right: auto;" src="../g/pattern-vector-svm1.jpg" alt="" width="178" height="148" /><span class="smallcaps">complex in low dimension</span></td>
<td style="text-align: center;"><img style="display: block; margin-left: auto; margin-right: auto;" src="../g/pattern-vector-svm2.jpg" alt="" width="190" height="148" /><span class="smallcaps">simple in higher dimension</span></td>
</tr>
</tbody>
</table>
<p class="smallcaps"><br />Gridsearch</p>
<p>Different settings for <span class="inline_code">degree</span>, <span class="inline_code">gamma</span>, <span class="inline_code">coeff0</span> and <span class="inline_code">cost</span> yield better or worse performance. Which settings to use? The <span class="inline_code">gridsearch()</span> function returns the K-fold cross-validation test results for every possible combination of optional parameters (given as lists of values):</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">gridsearch(Classifier, documents=[], folds=10, **kwargs)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.vector import SVM, RADIAL, gridsearch, kfoldcv, count
&gt;&gt;&gt; from pattern.db import csv
&gt;&gt;&gt;
&gt;&gt;&gt; data = csv('reviews.csv')
&gt;&gt;&gt; data = [(count(review), int(rating) &gt;= 3) for review, rating in data]
&gt;&gt;&gt;
&gt;&gt;&gt; for (A, P, R, F, o), p in gridsearch(SVM, data, kernel=[RADIAL], gamma=[0.1,1,10]):
&gt;&gt;&gt; print (A, P, R, F, o), p
(0.756, 0.679, 0.517, 0.578, 0.091) {'kernel': RADIAL, 'gamma': 0.1}
(0.753, 0.452, 0.503, 0.465, 0.078) {'kernel': RADIAL, 'gamma': 1}
(0.753, 0.477, 0.503, 0.474, 0.093) {'kernel': RADIAL, 'gamma': 10} </pre></div>
<p>A (faster) poor man's linear SVM often produces results that are equally accurate:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; print kfoldcv(SVM, data, folds=10)
(0.741, 0.617, 0.537, 0.570, 0.083) </pre></div>
<p><br /><span class="smallcaps">Libsvm and Liblinear</span></p>
<p>The SVM implementation in Pattern relies on the <a href="http://www.csie.ntu.edu.tw/~cjlin/libsvm/">LIBSVM</a> and <a href="http://www.csie.ntu.edu.tw/~cjlin/liblinear/">LIBLINEAR</a> C++ libraries. Precompiled bindings are included for Windows, Mac OS X and Ubuntu. These may not work on your system. In this case you need to compile the bindings from source (see the instructions in <span class="inline_code">pattern/vector/svm/INSTALL.txt</span>).</p>
<p class="small"><span style="text-decoration: underline;">Reference</span>: Chang, C.-C., Lin, C.-J. (2011). LIBSVM: a library for support vector machines. <em>ACM</em>.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="ga"></a>Genetic algorithm</h2>
<p>A <span class="inline_code">GA</span> or genetic algorithm is an optimization technique based on evolution by natural selection. With each <span class="inline_code">GA.update()</span>, the fittest candidates (e.g., lists or objects) are selected and recombined into a new generation, converging towards optimal fitness. GA's can be used for automatic <a class="link-maintenance" href="#feature-selection">feature selection</a>, for example.</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">ga = GA(candidates=[])</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">ga.population # List of candidates.
ga.generation # Current generation (int).
ga.avg # Average population fitness.</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">ga.fitness(candidate)
ga.combine(candidate1, candidate2)
ga.mutate(candidate)
</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">ga.update(top=0.5, mutation=0.5)</pre><p>The <span class="inline_code">GA.fitness()</span>, <span class="inline_code">combine()</span> and&nbsp;<span class="inline_code">mutate()</span> methods must be defined in a subclass.</p>
<ul>
<li><span class="inline_code">GA.fitness()</span>&nbsp;returns the given candidate's fitness as a value (<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>).</li>
<li><span class="inline_code">GA.combine()</span> returns a new candidate that is a combination of the given candidates.</li>
<li><span class="inline_code">GA.mutate()</span>&nbsp;returns a new candidate that is a mutation of&nbsp;the given candidate.</li>
<li><span class="inline_code">GA.update()</span> populates <span class="inline_code">GA.population</span> with a new generation of candidates,<br />each a combination of the <span class="inline_code">top</span> fittest candidates with a chance of <span class="inline_code">mutation</span> (<span class="inline_code">0.5</span> = 50%).</li>
</ul>
<p>The following GA converges from random character sequences to neologisms (invented words).</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.vector import GA, chngrams
&gt;&gt;&gt; from pattern.en import lexicon
&gt;&gt;&gt; from random import choice
&gt;&gt;&gt;
&gt;&gt;&gt; def chseq(length=4, chars='abcdefghijklmnopqrstuvwxyz'):
&gt;&gt;&gt; # Returns a string of random characters.
&gt;&gt;&gt; return ''.join(choice(chars) for i in range(length))
&gt;&gt;&gt;
&gt;&gt;&gt; class Jabberwocky(GA):
&gt;&gt;&gt;
&gt;&gt;&gt; def fitness(self, w):
&gt;&gt;&gt; return sum(0.2 for ch in chngrams(w, 4) if ch in lexicon) + \
&gt;&gt;&gt; sum(0.1 for ch in chngrams(w, 3) if ch in lexicon)
&gt;&gt;&gt;
&gt;&gt;&gt; def combine(self, w1, w2):
&gt;&gt;&gt; return w1[:len(w1)/2] + w2[len(w2)/2:] # cut-and-splice
&gt;&gt;&gt;
&gt;&gt;&gt; def mutate(self, w):
&gt;&gt;&gt; return w.replace(choice(w), chseq(1), 1)
&gt;&gt;&gt;
&gt;&gt;&gt; # Start with 10 strings, each 8 random characters.
&gt;&gt;&gt; candidates = [''.join(chseq(8)) for i in range(10)]
&gt;&gt;&gt;
&gt;&gt;&gt; ga = Jabberwocky(candidates)
&gt;&gt;&gt; i = 0
&gt;&gt;&gt; while ga.avg &lt; 1.0 and i &lt; 1000:
&gt;&gt;&gt; ga.update(top=0.5, mutation=0.3)
&gt;&gt;&gt; i += 1
&gt;&gt;&gt;
&gt;&gt;&gt; print ga.population
&gt;&gt;&gt; print ga.generation
&gt;&gt;&gt; print ga.avg</pre></div>
<p>In this example we are interested in creative language use.&nbsp;The GA's fitness function promotes substrings of 3-4 characters that are real words, ensuring that the invented words have a familiar feel.&nbsp;For example, <em>spingrsh</em> is not a real word, but <em>spin</em>, <em>pin</em> and <em>ping</em> are (<span class="inline_code">+0.7</span>). After a random mutation that replaces <em>r</em> with <em>a</em>, <em>spingash</em> also contains&nbsp;<em>gas</em> and <em>gash</em>, raising its fitness (<span class="inline_code">+1.0</span>).&nbsp;</p>
<p>By randomly combining sequences, we then end up with invented words such as <em>spingash</em>, <em>skidspat</em>, <em>galagush</em>,&nbsp;<em>halfetee</em>,&nbsp;<em>clubelle</em>, and <em>sodasham</em>.<br />&nbsp;</p>
<p class="small" style="text-align: left;"><em>The spingashes galagushed and the halfetees rupeeked,</em></p>
<p class="small" style="text-align: left;"><em>&nbsp; &nbsp;An oofundoo sloboored.</em></p>
<p class="small" style="text-align: left;"><em></em><em>The showshope skidspatted and the otherbits did dadampsi,</em></p>
<p class="small" style="text-align: left;"><em>&nbsp; &nbsp;And the willsage widskits bratslared.</em></p>
<p class="small" style="text-align: left;"><em></em>&nbsp;</p>
<hr />
<h2>See also</h2>
<ul>
<li><a href="http://orange.biolab.si/" target="_blank">Orange</a> (GPL): d<span>ata mining &amp; machine learning in Python, with a node-based GUI.</span></li>
<li><span><a href="http://pybrain.org/" target="_blank">PyBrain</a> (BSD): p</span><span>owerful machine learning algorithms in Python + C.</span></li>
<li><a href="http://www.scipy.org/" target="_blank">SciPy</a><span> (BSD): scientific computing tools for Python.</span></li>
<li><span><a href="http://scikit-learn.org/" target="_blank">scikit-learn</a> (BSD): machine learning algorithms tightly knit with numpy, scipy, matplotlib.</span></li>
</ul>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,952 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern-web</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern-web" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern-web</a></div>
<h1>pattern.web</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1355" class="node node-type-page"><div class="node-inner">
<div class="content">
<p class="big">The pattern.web module has tools for online data mining: asynchronous requests, a uniform API for web services (Google, Bing, Twitter, Facebook, Wikipedia, Wiktionary, Flickr, RSS), a HTML DOM parser, HTML tag stripping functions, a web crawler, webmail, caching, Unicode support.</p>
<p>It can be used by itself or with other <a href="pattern.html">pattern</a> modules: web | <a href="pattern-db.html">db</a> | <a href="pattern-en.html">en</a> | <a href="pattern-search.html">search</a> | <a href="pattern-vector.html">vector</a> | <a href="pattern-graph.html">graph</a>.</p>
<p><img src="../g/pattern_schema.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Documentation</h2>
<ul>
<li><a href="#URL">URLs</a></li>
<li><a href="#asynchronous">Asynchronous requests</a></li>
<li><a href="#services">Search engine + web services</a> <span class="smallcaps link-maintenance">(<a href="#google">google</a>, <a href="#google">bing</a>,&nbsp;<a href="#twitter">twitter</a>, <a href="#facebook">facebook</a>, <a href="#wikipedia">wikipedia</a>, flickr)</span></li>
<li><a href="#sort">Web sort</a></li>
<li><a href="#plaintext">HTML to plaintext</a></li>
<li><a href="#DOM">HTML DOM parser</a></li>
<li><a href="#pdf">PDF parser</a></li>
<li><a href="#crawler">Crawler</a></li>
<li><a href="#mail">E-mail</a></li>
<li><a href="#locale">Locale</a></li>
<li><a href="#cache">Cache</a></li>
</ul>
<p>&nbsp;</p>
<hr />
<h2><a name="URL"></a>URLs</h2>
<p>The <span class="inline_code">URL</span> object is a subclass of Python's <span class="inline_code">urllib2.Request</span> that can be used to connect to a web address. The <span class="inline_code">URL.download()</span> method can be used to retrieve the content (e.g., HTML source code). The constructor's <span class="inline_code">method</span> parameter defines how <span class="inline_code">query</span> data is encoded:</p>
<ul>
<li><span class="inline_code">GET</span>: query data is encoded in the URL string (usually for retrieving data).</li>
<li><span class="inline_code">POST</span>: query data is encoded in the message body (for posting data).</li>
</ul>
<pre class="brush:python; gutter:false; light:true;">url = URL(string='', method=GET, query={})
</pre><pre class="brush:python; gutter:false; light:true;">url.string # u'http://user:pw@domain.com:30/path/page?p=1#anchor'
url.parts # Dictionary of attributes:</pre><pre class="brush:python; gutter:false; light:true;">url.protocol # u'http'
url.username # u'user'
url.password # u'pw'
url.domain # u'domain.com'
url.port # 30
url.path # [u'path']
url.page # u'page'
url.query # {u'p': 1}
url.querystring # u'p=1'
url.anchor # u'anchor'</pre><pre class="brush:python; gutter:false; light:true;">url.exists # False if URL.open() raises a HTTP404NotFound.
url.redirect # Actual URL after redirection, or None.
url.headers # Dictionary of HTTP response headers.
url.mimetype # Document MIME-type.</pre><pre class="brush:python; gutter:false; light:true;">url.open(timeout=10, proxy=None)
url.download(timeout=10, cached=True, throttle=0, proxy=None, unicode=False)
url.copy() </pre><ul>
<li><span class="inline_code">URL()</span> expects a string that starts with a valid protocol (e.g. <span class="inline_code">http://</span>).<span class="inline_code"> </span></li>
<li><span class="inline_code">URL.open()</span> returns a connection from which data can be retrieved with <span class="inline_code">connection.read()</span>.</li>
<li><span class="inline_code">URL.download()</span> caches and returns the retrieved data. <br />It raises a <span class="inline_code">URLTimeout</span>&nbsp;if the download time exceeds the given <span class="inline_code">timeout</span>.<br />It sleeps for <span class="inline_code">throttle</span> seconds after the download is complete.<br />A proxy server can be given as a <span class="inline_code">(host,</span> <span class="inline_code">protocol)</span>-tuple, e.g., <span class="inline_code">('proxy.com',</span> <span class="inline_code">'https')</span>.<br />With <span class="inline_code">unicode=True</span>, returns the data as a Unicode string. By default it is <span class="inline_code">False</span> because the data can be binary (e.g., JPEG, ZIP) but <span class="inline_code">unicode=True</span> is advised for HTML.</li>
</ul>
<p>The example below downloads an image. <br />The <span class="inline_code">extension()</span> helper function parses the file extension from a file name:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import URL, extension
&gt;&gt;&gt;
&gt;&gt;&gt; url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
&gt;&gt;&gt; f = open('test' + extension(url.page), 'wb') # save as test.gif
&gt;&gt;&gt; f.write(url.download())
&gt;&gt;&gt; f.close()</pre></div>
<h3>URL downloads</h3>
<p>The <span class="inline_code">download()</span> function takes a URL string, calls <span class="inline_code">URL.download()</span> and returns the retrieved data. It takes the same optional parameters as <span class="inline_code">URL.download()</span>. This saves you a line of code.</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import download
&gt;&gt;&gt; html = download('http://www.clips.ua.ac.be/', unicode=True)</pre></div>
<h3>URL mime-type</h3>
<p>The <span class="inline_code">URL.mimetype</span> can be used to check the type of document at the given URL. This is more reliable than sniffing the filename extension (which may be omitted).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import URL, MIMETYPE_IMAGE
&gt;&gt;&gt;
&gt;&gt;&gt; url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
&gt;&gt;&gt; print url.mimetype in MIMETYPE_IMAGE
True</pre></div>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Global</span></td>
<td><span class="smallcaps">Value</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_WEBPAGE</span></td>
<td><span class="inline_code">['text/html']</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_STYLESHEET</span></td>
<td><span class="inline_code">['text/css']</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_PLAINTEXT</span></td>
<td><span class="inline_code">['text/plain']</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_PDF</span></td>
<td><span class="inline_code">['application/pdf']</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_NEWSFEED</span></td>
<td><span class="inline_code">['application/rss+xml', 'application/atom+xml']</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_IMAGE</span></td>
<td><span class="inline_code">['image/gif', 'image/jpeg', 'image/png']</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_AUDIO</span></td>
<td><span class="inline_code">['audio/mpeg', 'audio/mp4', 'audio/x-wav']</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_VIDEO</span></td>
<td><span class="inline_code">['video/mpeg', 'video/mp4', 'video/avi', 'video/quicktime']</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_ARCHIVE</span></td>
<td><span class="inline_code">['application/x-tar', 'application/zip']</span></td>
</tr>
<tr>
<td><span class="inline_code">MIMETYPE_SCRIPT</span></td>
<td><span class="inline_code">['application/javascript']</span></td>
</tr>
</tbody>
</table>
<h3>URL exceptions</h3>
<p>The <span class="inline_code">URL.open()</span> and <span class="inline_code">URL.download()</span> methods raise a <span class="inline_code">URLError</span> if an error occurs (e.g., no internet connection, server is down). <span class="inline_code">URLError</span> has a number of subclasses:</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Exception</span></td>
<td><span class="smallcaps">Description</span></td>
</tr>
<tr>
<td><span class="inline_code">URLError</span></td>
<td>URL has errors (e.g. a missing <span class="inline_code">t</span> in <span class="inline_code">htp://</span>)</td>
</tr>
<tr>
<td><span class="inline_code">URLTimeout</span></td>
<td>URL takes too long to load.</td>
</tr>
<tr>
<td><span class="inline_code">HTTPError</span></td>
<td>URL causes an error on the contacted server.</td>
</tr>
<tr>
<td><span class="inline_code">HTTP301Redirect</span></td>
<td>URL causes too many redirects.</td>
</tr>
<tr>
<td><span class="inline_code">HTTP400BadRequest</span></td>
<td>URL contains an invalid request.</td>
</tr>
<tr>
<td><span class="inline_code">HTTP401Authentication</span></td>
<td>URL requires a login and a password.</td>
</tr>
<tr>
<td><span class="inline_code">HTTP403Forbidden</span></td>
<td>URL is not accessible (check user-agent).</td>
</tr>
<tr>
<td><span class="inline_code">HTTP404NotFound</span></td>
<td>URL doesn't exist.</td>
</tr>
<tr>
<td><span class="inline_code">HTTP500InternalServerError</span></td>
<td>URL causes an error (bug?) on the server.</td>
</tr>
</tbody>
</table>
<h3>User-agent and referrer</h3>
<p>The <span class="inline_code">URL.open()</span> and <span class="inline_code">URL.download()</span> methods have two optional parameters <span class="inline_code">user_agent</span> and <span class="inline_code">referrer</span>, which can be used to identify the application accessing the web. Some websites include code to block out any application except browsers. By setting a <span class="inline_code">user_agent</span> you can make the application appear as a browser. This is called <em>spoofing</em> and it is not encouraged, but sometimes necessary.</p>
<p>For example, to pose as a Firefox browser:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; URL('http://www.clips.ua.ac.be').download(user_agent='Mozilla/5.0')
</pre></div>
<h3>Find URLs</h3>
<p>The <span class="inline_code">find_urls()</span> function can be used to parse URLs from a text string. It will retrieve a list of links starting with <span class="inline_code">http://</span>, <span class="inline_code">https://</span>, <span class="inline_code">www.</span> and domain names ending with <span class="inline_code">.com</span>, <span class="inline_code">.org</span> or <span class="inline_code">.net</span>. It will detect and strip leading punctuation (open parens) and trailing punctuation (period, comma, close parens). Similarly, the <span class="inline_code">find_email()</span> function can be used to parse e-mail addresses from a string.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import find_urls
&gt;&gt;&gt; print find_urls('Visit our website (www.clips.ua.ac.be)', unique=True)
['www.clips.ua.ac.be']
</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="asynchronous"></a>Asynchronous requests</h2>
<p>The <span class="inline_code">asynchronous()</span> function can be used to execute a function "in the background" (i.e., threaded). It takes the function, its arguments and optional keyword arguments. It returns an <span class="inline_code">AsynchronousRequest</span> object that contains the function's return value (when done). The main program does not halt in the meantime.</p>
<pre class="brush:python; gutter:false; light:true;">request = asynchronous(function, *args, **kwargs)</pre><pre class="brush:python; gutter:false; light:true;">request.done # True when the function is done.
request.elapsed # Running time, in seconds.
request.value # Function return value when done (or None).
request.error # Function Exception (or None).
</pre><pre class="brush:python; gutter:false; light:true;">request.now() # Waits for function and returns its value.
</pre><p>The example below executes a Google query without halting the main program. Instead, it displays a "busy" message (e.g., a progress bar updated in the application's event loop) until <span class="inline_code">request.done</span>.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import asynchronous, time, Google
&gt;&gt;&gt;
&gt;&gt;&gt; request = asynchronous(Google().search, 'holy grail', timeout=4)
&gt;&gt;&gt; while not request.done:
&gt;&gt;&gt; time.sleep(0.1)
&gt;&gt;&gt; print 'busy...'
&gt;&gt;&gt; print request.value
</pre></div>
<p>There is no way to stop a thread. You are responsible for ensuring that the given function doesn't hang.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="services"></a>Search engine + web services</h2>
<p>The <span class="inline_code">SearchEngine</span> object has a number of subclasses that can be used to query different web services (e.g., Google, Wikipedia). <span class="inline_code">SearchEngine.search()</span>&nbsp;returns a list of <span class="inline_code">Result</span> objects for a given query string, similar to a search field and a results page in a browser.</p>
<pre class="brush:python; gutter:false; light:true;">engine = SearchEngine(license=None, throttle=1.0, language=None)</pre><pre class="brush:python; gutter:false; light:true;">engine.license # Service license key.
engine.throttle # Time between requests (being nice to server).
engine.language # Restriction for Result.language (e.g., 'en').</pre><pre class="brush:python; gutter:false; light:true;">engine.search(query,
type = SEARCH, # SEARCH | IMAGE | NEWS
start = 1, # Starting page.
count = 10, # Results per page.
size = None, # Image size: TINY | SMALL | MEDIUM | LARGE
cached = True) # Cache locally?</pre><p><span class="small"><span style="text-decoration: underline;">Note</span>: <span class="inline_code">SearchEngine.search()</span> takes the same optional parameters as <span class="inline_code">URL.download()</span>.</span></p>
<h3>Google, Bing, Twitter, Facebook, Wikipedia, Flickr</h3>
<p><span class="inline_code">SearchEngine</span> is subclassed by <span class="inline_code">Google</span>, <span class="inline_code">Yahoo</span>, <span class="inline_code">Bing</span>, <span class="inline_code">DuckDuckGo</span>, <span class="inline_code">Twitter</span>, <span class="inline_code">Facebook</span>, <span class="inline_code">Wikipedia</span>, <span class="inline_code">Wiktionary</span>, <span class="inline_code">Wikia</span>, <span class="inline_code">DBPedia</span>, <span class="inline_code">Flickr</span> and <span class="inline_code">Newsfeed</span>. The constructors take the same parameters:</p>
<pre class="brush:python; gutter:false; light:true;">engine = Google(license=None, throttle=0.5, language=None)</pre><pre class="brush:python; gutter:false; light:true;">engine = Bing(license=None, throttle=0.5, language=None)</pre><pre class="brush:python; gutter:false; light:true;">engine = Twitter(license=None, throttle=0.5, language=None)</pre><pre class="brush:python; gutter:false; light:true;">engine = Facebook(license=None, throttle=1.0, language='en')</pre><pre class="brush:python; gutter:false; light:true;">engine = Wikipedia(license=None, throttle=5.0, language=None)</pre><pre class="brush:python; gutter:false; light:true;">engine = Flickr(license=None, throttle=5.0, language=None)</pre><p>Each search engine has different settings for the <span class="inline_code">search()</span> method. For example, <span class="inline_code">Twitter.search()</span> returns up to 3000 results for a given query (30 queries with 100 results each, or 300 queries with 10 results each). It has a limit of 150 queries per 15 minutes. Each call to <span class="inline_code">search()</span> counts as one query.</p>
<table class="border">
<tbody>
<tr>
<td><span class="smallcaps">Engine</span></td>
<td><span class="smallcaps">type</span></td>
<td><span class="smallcaps">start</span></td>
<td><span class="smallcaps">count</span></td>
<td><span class="smallcaps">limit</span></td>
<td><span class="smallcaps">throttle</span></td>
</tr>
<tr>
<td><span class="inline_code">Google</span></td>
<td><span class="inline_code">SEARCH<sup>1</sup></span></td>
<td>1-100/<span class="inline_code">count</span></td>
<td>1-10</td>
<td><span class="smallcaps">paid</span></td>
<td>0.5</td>
</tr>
<tr>
<td><span class="inline_code">Bing</span></td>
<td><span class="inline_code">SEARCH</span> <span class="inline_code">|</span> <span class="inline_code">NEWS</span> <span class="inline_code">|</span> <span class="inline_code">IMAGE</span><sup>1,2</sup></td>
<td>1-1000/<span class="inline_code">count</span></td>
<td>1-50</td>
<td class="smallcaps">paid</td>
<td>0.5</td>
</tr>
<tr>
<td><span class="inline_code">Yahoo</span></td>
<td><span class="inline_code">SEARCH</span> <span class="inline_code">|</span> <span class="inline_code">NEWS</span> <span class="inline_code">|</span> <span class="inline_code">IMAGE</span><sup>1,3</sup></td>
<td>1-1000/<span class="inline_code">count</span></td>
<td>1-50</td>
<td class="smallcaps">paid</td>
<td>0.5</td>
</tr>
<tr>
<td><span class="inline_code">DuckDuckGo</span></td>
<td><span class="inline_code">SEARCH</span></td>
<td>1</td>
<td>-</td>
<td class="smallcaps">-</td>
<td>0.5</td>
</tr>
<tr>
<td><span class="inline_code">Twitter</span></td>
<td><span class="inline_code">SEARCH</span></td>
<td>1-3000/<span class="inline_code">count</span></td>
<td>1-100</td>
<td>600/hour</td>
<td>0.5</td>
</tr>
<tr>
<td><span class="inline_code">Facebook</span></td>
<td><span class="inline_code">SEARCH</span> <span class="inline_code">|</span> <span class="inline_code">NEWS</span></td>
<td>1</td>
<td>1-100</td>
<td>500/hour</td>
<td>1.0</td>
</tr>
<tr>
<td><span class="inline_code">Wikipedia</span></td>
<td><span class="inline_code">SEARCH</span></td>
<td>1</td>
<td>1</td>
<td>-</td>
<td>5.0</td>
</tr>
<tr>
<td><span class="inline_code">Wiktionary</span></td>
<td><span class="inline_code">SEARCH</span></td>
<td>1</td>
<td>1</td>
<td>-</td>
<td>5.0</td>
</tr>
<tr>
<td><span class="inline_code">Wikia</span></td>
<td><span class="inline_code">SEARCH</span></td>
<td>1</td>
<td>1</td>
<td>-</td>
<td>5.0</td>
</tr>
<tr>
<td><span class="inline_code">DBPedia</span></td>
<td><span class="inline_code">SPARQL</span></td>
<td>1+</td>
<td>1-1000</td>
<td>10/sec</td>
<td>1.0</td>
</tr>
<tr>
<td><span class="inline_code">Flickr<br /></span></td>
<td><span class="inline_code">IMAGE</span></td>
<td>1+</td>
<td>1-500</td>
<td>-</td>
<td>5.0</td>
</tr>
<tr>
<td><span class="inline_code">Newsfeed</span></td>
<td><span class="inline_code">NEWS</span></td>
<td>1</td>
<td>1+</td>
<td>?</td>
<td>1.0</td>
</tr>
</tbody>
</table>
<p><span class="small"><sup>1 </sup><span class="inline_code">Google</span>, <span class="inline_code">Bing</span> and <span class="inline_code">Yahoo</span> are paid services (see below for how to obtain a license key).<br /></span> <span class="small"><sup>2 </sup><span class="inline_code">Bing.search(type=NEWS)</span> has a <span class="inline_code">count</span> of 1-15.<br /></span> <span class="small"><sup>3 </sup><span class="inline_code">Yahoo.search(type=IMAGE)</span> has a <span class="inline_code">count</span> of 1-35.</span><br /> <span class="smallcaps"><br /><a name="license"></a>Web service license key</span></p>
<p>Some services require a license key. They may work without one, but this implies that you share a public license key (and query limit) with other users of the pattern.web module. If the query limit is exceeded, <span class="inline_code">SearchEngine.search()</span>&nbsp;raises a&nbsp;<span class="inline_code">SearchEngineLimitError</span>.</p>
<ul>
<li><span class="inline_code">Google</span> is a paid service ($1 for 200 queries), with 100 free queries per day. When you obtain a license key (follow the link below), activate "Custom Search API" and "Translate API" under "Services" and look up the key under "API Access".</li>
<li><span class="inline_code">Bing</span> is a paid service ($1 for 500 queries), with 5,000 free queries per month.</li>
<li><span class="inline_code">Yahoo</span> is a paid service ($1 for 1250 queries) that requires an OAuth key + secret, which can be passed as a tuple: <span class="inline_code">Yahoo(license=(key,</span> <span class="inline_code">secret))</span>.</li>
</ul>
<p>Obtain a license key: <a href="https://code.google.com/apis/console/" target="_blank">Google</a>, <a href="https://datamarket.azure.com/dataset/5BA839F1-12CE-4CCE-BF57-A49D98D29A44" target="_blank">Bing</a>, <a href="http://developer.yahoo.com/search/boss/" target="_blank">Yahoo</a>, <a href="https://apps.twitter.com/app/new" target="_blank">Twitter</a>, <a href="/pattern-facebook" target="_blank">Facebook</a>, <a href="http://www.flickr.com/services/api/keys/" target="_blank">Flickr</a>.<br /><span class="smallcaps"><br />Web service request throttle</span></p>
<p>A <span class="inline_code">SearchEngine.search()</span> request takes a minimum amount of time to complete, as outlined in the table above. This is intended as etiquette towards the server providing the service. Raise the <span class="inline_code">throttle</span> value if you plan to run multiple queries in batch.&nbsp;Wikipedia requests are especially intensive. If you plan to mine a lot of data from Wikipedia, download the <a href="http://en.wikipedia.org/wiki/Wikipedia:Database_download">Wikipedia database</a> instead.</p>
<p>&nbsp;</p>
<hr />
<h2>Search Engine results</h2>
<p><span class="inline_code">SearchEngine.search()</span>&nbsp;returns a list of <span class="inline_code">Result</span> objects. It has an additional <span class="inline_code">total</span> property, which is the total number of results available for the given query. Each <span class="inline_code">Result</span> is a dictionary with extra properties:</p>
<pre class="brush:python; gutter:false; light:true;">result = Result(url)</pre><pre class="brush:python; gutter:false; light:true;">result.url # URL of content associated with the given query.
result.title # Content title.
result.text # Content summary.
result.language # Content language.
result.author # For news items and images.
result.date # For news items.</pre><pre class="brush:python; gutter:false; light:true;">result.download(timeout=10, cached=True, proxy=None)
</pre><ul>
<li><span class="inline_code">Result.download()</span>&nbsp;takes the same optional parameters as <span class="inline_code">URL.download()</span>.</li>
<li>The attributes (e.g., <span class="inline_code">result.text</span>) are Unicode strings.</li>
</ul>
<p><a name="google"></a>For example:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Bing, SEARCH, plaintext
&gt;&gt;&gt;
&gt;&gt;&gt; engine = Bing(license=None) # Enter your license key.
&gt;&gt;&gt; for i in range(1,5):
&gt;&gt;&gt;     for result in engine.search('holy handgrenade', type=SEARCH, start=i):
&gt;&gt;&gt;         print repr(plaintext(result.text))
&gt;&gt;&gt;         print
u"The Holy Hand Grenade of Antioch is a fictional weapon from ..."
u'Once the number three, being the third number, be reached, then ...'
</pre></div>
<p>Since <span class="inline_code">SearchEngine.search()</span> takes the same optional parameters as <span class="inline_code">URL.download()</span>&nbsp;it is easy to disable local caching, set a proxy server, a throttle (minimum time) or a timeout (maximum time).</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import Google
&gt;&gt;&gt;
&gt;&gt;&gt; engine = Google(license=None) # Enter your license key.
&gt;&gt;&gt; for result in engine.search('tim', cached=False, proxy=('proxy.com', 'https')):
&gt;&gt;&gt;     print result.url
&gt;&gt;&gt;     print result.text</pre></div>
<p><span class="smallcaps"><br />Image search</span></p>
<p>For <span class="inline_code">Flickr</span>, <span class="inline_code">Bing</span>&nbsp;and&nbsp;<span class="inline_code">Yahoo</span>, image URLs retrieved with <span class="inline_code">search(type=IMAGE)</span> can be filtered by setting the&nbsp;<span class="inline_code">size</span> to <span class="inline_code">TINY</span>, <span class="inline_code">SMALL</span>, <span class="inline_code">MEDIUM</span>, <span class="inline_code">LARGE</span> or <span class="inline_code">None</span> (any size). Images may be subject to copyright.</p>
<p>For <span class="inline_code">Flickr</span>, use <span class="inline_code">search(copyright=False)</span> to retrieve results with no copyright restrictions (either public domain or Creative Commons <a href="http://creativecommons.org/licenses/by-sa/2.0/">by-sa</a>).</p>
<p>For <span class="inline_code">Twitter</span>, each result has a <span class="inline_code">Result.profile</span> property with the URL to the user's profile picture.</p>
<p>&nbsp;</p>
<hr />
<h2>Google translate</h2>
<p><span class="inline_code">Google.translate()</span>&nbsp;returns the translated string in the given language.<br /><span class="inline_code">Google.identify()</span>&nbsp;returns a <span class="inline_code">(language</span> <span class="inline_code">code,</span> <span class="inline_code">confidence)</span>-tuple for a given string.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Google
&gt;&gt;&gt;
&gt;&gt;&gt; s = "C'est un lapin, lapin de bois. Quoi? Un cadeau."
&gt;&gt;&gt; g = Google()
&gt;&gt;&gt; print g.translate(s, input='fr', output='en', cached=False)
&gt;&gt;&gt; print g.identify(s)
u"It's a rabbit, wood. What? A gift."
(u'fr', 0.76) </pre></div>
<p>Remember to activate the Translate API in the <a href="https://code.google.com/apis/console" target="_blank">Google API Console</a>. Max. 1,000 characters per request.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="twitter"></a>Twitter search</h2>
<p>The <span class="inline_code">start</span> parameter of&nbsp;<span class="inline_code">Twitter.search()</span>&nbsp;takes an <span class="inline_code">int</span> (= the starting page, cfr. other search engines) or a <span class="inline_code">tweet.id</span>. If you create two <span class="inline_code">Twitter</span> objects, their result pages for a given query may not correspond, since new tweets become available more quickly than we can query pages. The best way is to pass the last seen tweet id:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import Twitter
&gt;&gt;&gt;
&gt;&gt;&gt; t = Twitter()
&gt;&gt;&gt; i = None
&gt;&gt;&gt; for j in range(3):
&gt;&gt;&gt;     for tweet in t.search('win', start=i, count=10):
&gt;&gt;&gt;         print tweet.text
&gt;&gt;&gt;         print
&gt;&gt;&gt;         i = tweet.id</pre></div>
<p>&nbsp;</p>
<hr />
<h2>Twitter streams</h2>
<p><span class="inline_code">Twitter.stream()</span>&nbsp;returns an endless, live stream of <span class="inline_code">Result</span> objects. A <span class="inline_code">Stream</span> is a Python list that accumulates each time <span class="inline_code">Stream.update()</span> is called:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import Twitter
&gt;&gt;&gt; import time
&gt;&gt;&gt;
&gt;&gt;&gt; s = Twitter().stream('#fail')
&gt;&gt;&gt; for i in range(10):
&gt;&gt;&gt;     time.sleep(1)
&gt;&gt;&gt;     s.update(bytes=1024)
&gt;&gt;&gt;     print s[-1].text if s else ''</pre></div>
<p>To clear the accumulated list, call <span class="inline_code">Stream.clear()</span>.</p>
<p>&nbsp;</p>
<hr />
<h2>Twitter trends</h2>
<p><span class="inline_code">Twitter.trends()</span>&nbsp;returns a list of 10 "trending topics":</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Twitter
&gt;&gt;&gt; print Twitter().trends(cached=False)
[u'#neverunderstood', u'Not Top 10', ...]</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="wikipedia"></a>Wikipedia articles</h2>
<p><span class="inline_code">Wikipedia.search()</span>&nbsp;returns a single <span class="inline_code">WikipediaArticle</span> for the given (case-sensitive) query, which is the title of an article. <span class="inline_code">Wikipedia.index()</span> returns an iterator over all article titles on Wikipedia. The <span class="inline_code">language</span> parameter of the&nbsp;<span class="inline_code">Wikipedia()</span> constructor defines the language of the returned articles (by default it is&nbsp;<span class="inline_code">"en"</span>, which corresponds to <a href="http://en.wikipedia.org/" target="_blank">en.wikipedia.org</a>).</p>
<pre class="brush:python; gutter:false; light:true;">article = WikipediaArticle(title='', source='', links=[])</pre><pre class="brush:python; gutter:false; light:true;">article.source # Article HTML source.
article.string # Article plaintext unicode string.</pre><pre class="brush:python; gutter:false; light:true;">article.title # Article title.
article.sections # Article sections.
article.links # List of titles of linked articles.
article.external # List of external links.
article.categories # List of categories.
article.media # List of linked media (images, sounds, ...)
article.languages # Dictionary of (language, article)-items.
article.language # Article language (i.e., 'en').
article.disambiguation # True if it is a disambiguation page</pre><pre class="brush:python; gutter:false; light:true;">article.plaintext(**kwargs) # See plaintext() for parameters overview.
article.download(media, **kwargs)
</pre><p><span class="inline_code">WikipediaArticle.plaintext()</span>&nbsp;is similar to&nbsp;<span class="inline_code">plaintext()</span>, with special attention for MediaWiki markup. It strips metadata, infoboxes, table of contents, annotations, thumbnails and disambiguation links.</p>
<h3>Wikipedia article sections</h3>
<p><span class="inline_code">WikipediaArticle.sections</span>&nbsp;is a list of&nbsp;<span class="inline_code">WikipediaSection</span> objects. Each section has a title and a number of paragraphs that belong together.</p>
<pre class="brush:python; gutter:false; light:true;">section = WikipediaSection(article, title='', start=0, stop=0, level=1)</pre><pre class="brush:python; gutter:false; light:true;">section.article # WikipediaArticle parent.
section.parent # WikipediaSection this section is part of.
section.children # WikipediaSections belonging to this section.</pre><pre class="brush:python; gutter:false; light:true;">section.title # Section title.
section.source # Section HTML source.
section.string # Section plaintext unicode string.
section.content # Section string minus title.
section.level # Section nested depth (from 0).
section.links # List of titles of linked articles.
section.tables # List of WikipediaTable objects.</pre><p>The following example downloads a Wikipedia article and prints the title of each section, indented according to the section level:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Wikipedia
&gt;&gt;&gt;
&gt;&gt;&gt; article = Wikipedia().search('cat')
&gt;&gt;&gt; for section in article.sections:
&gt;&gt;&gt;     print repr('    ' * section.level + section.title)
u'Cat'
u'    Nomenclature and etymology'
u'    Taxonomy and evolution'
u'    Genetics'
u'    Anatomy'
u'    Behavior'
u'        Sociability'
u'        Grooming'
u'        Fighting'
... </pre></div>
<h3>Wikipedia article tables</h3>
<p><span class="inline_code">WikipediaSection.tables</span>&nbsp;is a list of&nbsp;<span class="inline_code">WikipediaTable</span> objects. Each table has a title, headers and rows.</p>
<pre class="brush:python; gutter:false; light:true;">table = WikipediaTable(section, title='', headers=[], rows=[], source='')</pre><pre class="brush:python; gutter:false; light:true;">table.section # WikipediaSection parent.
table.source # Table HTML source.
table.title # Table title.
table.headers # List of table column headers.
table.rows # List of table rows, each a list of column values.</pre><p>&nbsp;</p>
<hr />
<h2><a name="wikia"></a>Wikia</h2>
<p><a href="http://www.wikia.com/" target="_blank">Wikia</a> is a free hosting service for thousands of wikis. <span class="inline_code">Wikipedia</span>, <span class="inline_code">Wiktionary</span> and <span class="inline_code">Wikia</span> all inherit the&nbsp;<span class="inline_code">MediaWiki</span> base class, so <span class="inline_code">Wikia</span> has the same methods and properties as <span class="inline_code">Wikipedia</span>. Its constructor takes the name of a domain on Wikia. Note the use of <span class="inline_code">Wikia.index()</span>, which returns an iterator over all available article titles:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import Wikia
&gt;&gt;&gt;
&gt;&gt;&gt; w = Wikia(domain='montypython')
&gt;&gt;&gt; for i, title in enumerate(w.index(start='a', throttle=1.0, cached=True)):
&gt;&gt;&gt;     if i &gt;= 3:
&gt;&gt;&gt;         break
&gt;&gt;&gt;     article = w.search(title)
&gt;&gt;&gt;     print repr(article.title)
u'Albatross'
u'Always Look on the Bright Side of Life'
u'And Now for Something Completely Different'</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="dbpedia"></a>DBPedia</h2>
<p><a href="http://dbpedia.org/About" target="_blank">DBPedia</a> is a database of structured information mined from Wikipedia and stored as (subject, predicate, object)-triples (e.g., <em>cat</em> <span class="postag">is-a</span> <em>animal</em>). DBPedia can be queried with <a href="http://www.w3.org/TR/rdf-sparql-query/" target="_blank">SPARQL</a>, where subject, predicate and/or object can be given as&nbsp;<span class="inline_code">?variables</span>. The&nbsp;<span class="inline_code">Result</span> objects in the list returned from <span class="inline_code">DBPedia.search()</span> have the variables as additional properties:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import DBPedia
&gt;&gt;&gt;
&gt;&gt;&gt; sparql = '\n'.join((
&gt;&gt;&gt;     'prefix dbo: &lt;http://dbpedia.org/ontology/&gt;',
&gt;&gt;&gt;     'select ?person ?place where {',
&gt;&gt;&gt;     ' ?person a dbo:President.',
&gt;&gt;&gt;     ' ?person dbo:birthPlace ?place.',
&gt;&gt;&gt;     '}'
&gt;&gt;&gt; ))
&gt;&gt;&gt; for r in DBPedia().search(sparql, start=1, count=10):
&gt;&gt;&gt;     print '%s (%s)' % (r.person.name, r.place.name)
Álvaro Arzú (Guatemala City)
Árpád Göncz (Budapest)
...</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="facebook"></a>Facebook posts, comments &amp; likes</h2>
<p><span class="inline_code">Facebook.search(query,</span> <span class="inline_code">type=SEARCH)</span> returns a list of <span class="inline_code">Result</span> objects, where each result is a (publicly available) post that contains (or whose comments contain) the given query.</p>
<p><span class="inline_code">Facebook.search(id,</span> <span class="inline_code">type=NEWS)</span> returns posts from a given user profile. You need to supply a personal license key. You can get a key when you <a href="/pattern-facebook" target="_blank">authorize Pattern</a> to search Facebook in your name.</p>
<p><span class="inline_code">Facebook.search(id,</span> <span class="inline_code">type=COMMENTS)</span> retrieves comments for a given post's&nbsp;<span class="inline_code">Result.id</span>. You can also pass the id of a post or a comment to <span class="inline_code">Facebook.search(id, type=LIKES)</span> to retrieve users that liked it.</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import Facebook, NEWS, COMMENTS, LIKES
&gt;&gt;&gt;
&gt;&gt;&gt; fb = Facebook(license='your key')
&gt;&gt;&gt; me = fb.profile(id=None) # (id, name, date, gender, locale, likes)-tuple
&gt;&gt;&gt;
&gt;&gt;&gt; for post in fb.search(me[0], type=NEWS, count=100):
&gt;&gt;&gt;     print repr(post.id)
&gt;&gt;&gt;     print repr(post.text)
&gt;&gt;&gt;     print repr(post.url)
&gt;&gt;&gt;     if post.comments &gt; 0:
&gt;&gt;&gt;         print '%i comments' % post.comments
&gt;&gt;&gt;         print [(r.text, r.author) for r in fb.search(post.id, type=COMMENTS)]
&gt;&gt;&gt;     if post.likes &gt; 0:
&gt;&gt;&gt;         print '%i likes' % post.likes
&gt;&gt;&gt;         print [r.author for r in fb.search(post.id, type=LIKES)]
u'530415277_10151455896030278'
u'Tom De Smedt likes CLiPS Research Center'
u'http://www.facebook.com/CLiPS.UA'
1 likes
[(u'485942414773810', u'CLiPS Research Center')]
.... </pre></div>
<p>The maximum <span class="inline_code">count</span> for <span class="inline_code">COMMENTS</span> and <span class="inline_code">LIKES</span> is 1000 (by default, 10).&nbsp;</p>
<p>&nbsp;</p>
<hr />
<h2>RSS + Atom newsfeeds</h2>
<p>The <span class="inline_code">Newsfeed</span> object is a wrapper for Mark Pilgrim's <a href="http://www.feedparser.org/" target="_blank">Universal Feed Parser</a>. <span class="inline_code">Newsfeed.search()</span> takes the URL of an RSS or Atom news feed and returns a list of <span class="inline_code">Result</span> objects.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Newsfeed
&gt;&gt;&gt;
&gt;&gt;&gt; NATURE = 'http://www.nature.com/nature/current_issue/rss/index.html'
&gt;&gt;&gt; for result in Newsfeed().search(NATURE)[:5]:
&gt;&gt;&gt;     print repr(result.title)
u'Biopiracy rules should not block biological control'
u'Animal behaviour: Same-shaped shoals'
u'Genetics: Fast disease factor'
u'Biomimetics: Material monitors mugginess'
u'Cell biology: Lung lipid hurts breathing'
</pre></div>
<p><span class="inline_code">Newsfeed.search()</span> has an optional parameter <span class="inline_code">tags</span>, which is a list of custom tags to parse:</p>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; for result in Newsfeed().search(NATURE, tags=['dc:identifier']):
&gt;&gt;&gt;     print result.dc_identifier</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="sort"></a>Web sort</h2>
<p>The return value of&nbsp;<span class="inline_code">SearchEngine.search()</span> has a <span class="inline_code">total</span> property which can be used to sort queries by "crowdvoting".&nbsp;The <span class="inline_code">sort()</span> function sorts a given list of terms according to their total result count, and returns a list of <span class="inline_code">(percentage,</span> <span class="inline_code">term)</span>-tuples.</p>
<pre class="brush:python; gutter:false; light:true;">sort(
terms = [], # List of search terms.
context = '', # Term used for sorting.
service = GOOGLE, # GOOGLE | BING | YAHOO | FLICKR
license = None, # Service license key.
strict = True, # Wrap query in quotes?
prefix = False, # context + term or term + context?
cached = True)</pre><p>When a <span class="inline_code">context</span> is defined, the function sorts by relevance to the context, e.g.,&nbsp;<span class="inline_code">sort(["black",</span> <span class="inline_code">"white"],</span> <span class="inline_code">context="Darth</span> <span class="inline_code">Vader")</span> yields <em>black</em> as the best candidate, because <span class="inline_code">"black</span> <span class="inline_code">Darth</span> <span class="inline_code">Vader"</span> is more common in search results.</p>
<p>Now let's see who is more dangerous:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import sort
&gt;&gt;&gt;
&gt;&gt;&gt; results = sort(terms=[
&gt;&gt;&gt; 'arnold schwarzenegger',
&gt;&gt;&gt; 'chuck norris',
&gt;&gt;&gt; 'dolph lundgren',
&gt;&gt;&gt; 'steven seagal',
&gt;&gt;&gt; 'sylvester stallone',
&gt;&gt;&gt; 'mickey mouse'], context='dangerous', prefix=True)
&gt;&gt;&gt;
&gt;&gt;&gt; for weight, term in results:
&gt;&gt;&gt; print "%.2f" % (weight * 100) + '%', term
84.34% 'dangerous mickey mouse'
9.24% 'dangerous chuck norris'
2.41% 'dangerous sylvester stallone'
2.01% 'dangerous arnold schwarzenegger'
1.61% 'dangerous steven seagal'
0.40% 'dangerous dolph lundgren'
</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="plaintext"></a>HTML to plaintext</h2>
<p>The HTML source code of a web page can be retrieved with&nbsp;<span class="inline_code">URL.download()</span>. HTML is a markup language that uses <em>tags</em> to define text formatting.&nbsp;For example,&nbsp;<span class="inline_code">&lt;b&gt;hello&lt;/b&gt;</span> displays <strong>hello</strong> in bold. For many tasks we may want to strip the formatting so we can analyze (e.g., <a href="pattern-en.html#parser">parse</a> or <a href="pattern-vector.html#wordcount">count</a>) the plain text.</p>
<p>The <span class="inline_code">plaintext()</span> function removes HTML formatting from a string.</p>
<pre class="brush:python; gutter:false; light:true;">plaintext(html, keep=[], replace=blocks, linebreaks=2, indentation=False)</pre><p>It performs the following steps to clean up the given string:</p>
<ul>
<li><strong>Strip javascript:</strong> remove all <span class="inline_code">&lt;script&gt;</span> elements.</li>
<li><strong>Strip CSS: </strong>remove all <span class="inline_code">&lt;style&gt;</span> elements.</li>
<li><strong>Strip comments:</strong> remove all <span class="inline_code">&lt;!-- --&gt;</span> elements.</li>
<li><strong>Strip forms: </strong>remove all <span class="inline_code">&lt;form&gt;</span> elements.</li>
<li><strong>Strip tags: </strong>remove all HTML tags.</li>
<li><strong>Decode entities:</strong> replace <span class="inline_code">&amp;lt;</span> with <span class="inline_code">&lt;</span> (for example).</li>
<li><strong>Collapse spaces:</strong>&nbsp;replace consecutive spaces with a single space.</li>
<li><strong>Collapse linebreaks:</strong>&nbsp;replace consecutive linebreaks with a single linebreak.</li>
<li><strong>Collapse tabs:</strong>&nbsp;replace consecutive tabs with a single space. Optionally, indentation (i.e., tabs at the start of a line) can be preserved.</li>
</ul>
<p><span class="smallcaps">plaintext parameters</span></p>
<p>The <span class="inline_code">keep</span> parameter is a list of tags to retain. By default, attributes are stripped, e.g.,&nbsp;<span class="inline_code">&lt;table border="0"&gt;</span> becomes <span class="inline_code">&lt;table&gt;</span>. To preserve specific attributes, a dictionary can be given: <span class="inline_code">{"a":</span> <span class="inline_code">["href"]}</span>.</p>
<p>The <span class="inline_code">replace</span> parameter defines how HTML elements are replaced with other characters to improve plain text layout. It is a dictionary of <span class="inline_code">tag</span> &rarr; <span class="inline_code">(before,</span> <span class="inline_code">after)</span> items. By default, it&nbsp;replaces block elements (e.g., <span class="inline_code">&lt;h1&gt;</span>, <span class="inline_code"> </span><span class="inline_code">&lt;h2&gt;</span>, <span class="inline_code"> </span><span class="inline_code">&lt;p&gt;</span>, <span class="inline_code"> </span><span class="inline_code">&lt;div&gt;</span>, <span class="inline_code"> </span><span class="inline_code">&lt;table&gt;</span>, ...) with two linebreaks, <span class="inline_code">&lt;th&gt;</span> and <span class="inline_code">&lt;tr&gt;</span> with one linebreak, <span class="inline_code">&lt;td&gt;</span> with one tab, and&nbsp;<span class="inline_code">&lt;li&gt;</span> with an asterisk (<span class="inline_code">*</span>) before and a linebreak after.</p>
<p>The <span class="inline_code">linebreaks</span> parameter defines the maximum number of consecutive linebreaks to retain.</p>
<p>The <span class="inline_code">indentation</span> parameter defines whether or not to retain tab indentation.</p>
<p>The following example downloads an HTML document and keeps a minimal amount of formatting (headings, bold, links).</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import URL, plaintext
&gt;&gt;&gt;
&gt;&gt;&gt; s = URL('http://www.clips.ua.ac.be').download()
&gt;&gt;&gt; s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
&gt;&gt;&gt; print s
</pre></div>
<p style="margin-top: 1.3em;"><span class="smallcaps">plaintext = strip + decode + collapse</span></p>
<p>The different steps in <span class="inline_code">plaintext()</span> are available as separate functions:</p>
<pre class="brush:python; gutter:false; light:true;">decode_utf8(string) # Byte string to Unicode string.</pre><pre class="brush:python; gutter:false; light:true;">encode_utf8(string) # Unicode string to byte string.
</pre><pre class="brush:python; gutter:false; light:true;">strip_tags(html, keep=[], replace=blocks) # Non-trivial, using SGML parser.
</pre><pre class="brush:python; gutter:false; light:true;">strip_between(a, b, string) # Remove anything between (and including) a and b.
</pre><pre class="brush:python; gutter:false; light:true;">strip_javascript(html) # Strips between '&lt;script*&gt;' and '&lt;/script&gt;'.</pre><pre class="brush:python; gutter:false; light:true;">strip_inline_css(html) # Strips between '&lt;style*&gt;' and '&lt;/style&gt;'.</pre><pre class="brush:python; gutter:false; light:true;">strip_comments(html) # Strips between '&lt;!--' and '--&gt;'.</pre><pre class="brush:python; gutter:false; light:true;">strip_forms(html) # Strips between '&lt;form*&gt;' and '&lt;/form&gt;'.</pre><pre class="brush:python; gutter:false; light:true;">decode_entities(string) # '&amp;lt;' =&gt; '&lt;'</pre><pre class="brush:python; gutter:false; light:true;">encode_entities(string) # '&lt;' =&gt; '&amp;lt;' </pre><pre class="brush:python; gutter:false; light:true;">decode_url(string) # 'and%2For' =&gt; 'and/or'</pre><pre class="brush:python; gutter:false; light:true;">encode_url(string) # 'and/or' =&gt; 'and%2For' </pre><pre class="brush:python; gutter:false; light:true;">collapse_spaces(string, indentation=False, replace=' ')</pre><pre class="brush:python; gutter:false; light:true;">collapse_tabs(string, indentation=False, replace=' ')</pre><pre class="brush:python; gutter:false; light:true;">collapse_linebreaks(string, threshold=1)</pre><p>&nbsp;</p>
<hr />
<h2 class="example"><a name="DOM"></a>HTML DOM parser</h2>
<p>The Document Object Model (DOM) is a language-independent convention for representing HTML, XHTML and XML documents. The pattern.web module includes an HTML DOM parser (based on Leonard Richardson's <a href="http://www.crummy.com/software/BeautifulSoup/" target="_blank">BeautifulSoup</a>) that can be used to traverse an HTML document as a tree of linked Python objects. This is useful to extract specific portions from an HTML string retrieved with <span class="inline_code">URL.download()</span>.</p>
<h3>Node</h3>
<p>The DOM consists of a <span class="inline_code">DOM</span> object that contains <span class="inline_code">Text</span>, <span class="inline_code">Comment</span> and <span class="inline_code">Element</span> objects.<br />All of these are subclasses of <span class="inline_code">Node</span>.</p>
<pre class="brush:python; gutter:false; light:true;">node = Node(html, type=NODE)</pre><pre class="brush:python; gutter:false; light:true;">node.type # NODE | TEXT | COMMENT | ELEMENT | DOCUMENT
node.source # HTML source.
node.parent # Parent node.
node.children # List of child nodes.
node.next # Next child in node.parent (or None).
node.previous # Previous child in node.parent (or None).</pre><pre class="brush:python; gutter:false; light:true;">node.traverse(visit=lambda node: None)</pre><h3>Element</h3>
<p><span class="inline_code">Text</span>, <span class="inline_code">Comment</span> and <span class="inline_code">Element</span> are subclasses of <span class="inline_code">Node</span>. For example,&nbsp;<span class="inline_code">'the</span> <span class="inline_code">&lt;b&gt;cat&lt;/b&gt;'</span> is parsed to <span class="inline_code">Text('the')</span> + <span class="inline_code">Element('cat',</span> <span class="inline_code">tag='b')</span>. The <span class="inline_code">Element</span> object has a number of additional properties:</p>
<pre class="brush:python; gutter:false; light:true;">element = Element(html)</pre><pre class="brush:python; gutter:false; light:true;">element.tag # Tag name.
element.attrs # Dictionary of attributes, e.g. {'class':'comment'}.
element.id # Value for id attribute (or None).</pre><pre class="brush:python; gutter:false; light:true;">element.source # HTML source.
element.content # HTML source minus open and close tag.</pre><pre class="brush:python; gutter:false; light:true;">element.by_id(str) # First nested Element with given id.
element.by_tag(str) # List of nested Elements with given tag name.
element.by_class(str) # List of nested Elements with given class.
element.by_attr(**kwargs) # List of nested Elements with given attribute.
element(selector) # List of nested Elements matching a CSS selector.
</pre><ul>
<li><span class="inline_code">Element.by_tag()</span>&nbsp;can include a class (e.g.,&nbsp;<span class="inline_code">"div.header"</span>) or an id (e.g.,&nbsp;<span class="inline_code">"div#content"</span>). <br />A wildcard can be used to match any tag. (e.g. <span class="inline_code">"*.even"</span>).<br />The element is searched recursively (children in children, etc.)</li>
<li><span class="inline_code">Element.by_attr()</span> takes one or more keyword arguments (e.g.,&nbsp;<span class="inline_code">name="keywords"</span>).</li>
<li><span class="inline_code">Element(selector)</span> returns a list of nested elements that match the given <a href="http://www.w3.org/TR/CSS2/selector.html" target="_blank">CSS selector</a>:</li>
</ul>
<p>Overview of CSS selectors:</p>
<div>
<table class="border">
<tbody>
<tr>
<td class="smallcaps">CSS Selector</td>
<td class="smallcaps">Description</td>
</tr>
<tr>
<td class="inline_code">element('*')</td>
<td>all nested elements</td>
</tr>
<tr>
<td class="inline_code">element('*#x')</td>
<td>all nested elements with <span class="inline_code">id="x"</span></td>
</tr>
<tr>
<td class="inline_code">element('div#x')</td>
<td>all nested <span class="inline_code">&lt;div&gt;</span> elements with <span class="inline_code">id="x"</span></td>
</tr>
<tr>
<td class="inline_code">element('div.x')</td>
<td>all nested <span class="inline_code">&lt;div&gt;</span> elements with <span class="inline_code">class="x"</span></td>
</tr>
<tr>
<td class="inline_code">element('div[class="x"]')</td>
<td>all nested<span class="inline_code"> &lt;div&gt;</span> elements with attribute <span class="inline_code">"class"</span> = <span class="inline_code">"x"</span></td>
</tr>
<tr>
<td class="inline_code">element('div:first-child')</td>
<td>the first child in a <span class="inline_code">&lt;div&gt;</span></td>
</tr>
<tr>
<td class="inline_code">element('div a')</td>
<td>all nested <span class="inline_code">&lt;a&gt;</span>'s inside a nested <span class="inline_code">&lt;div&gt;</span></td>
</tr>
<tr>
<td class="inline_code">element('div, a')</td>
<td>all nested <span class="inline_code">&lt;a&gt;</span>'s and <span class="inline_code">&lt;div&gt;</span> elements</td>
</tr>
<tr>
<td class="inline_code">element('div + a')</td>
<td>all nested <span class="inline_code">&lt;a&gt;</span>'s directly preceded by a <span class="inline_code">&lt;div&gt;</span></td>
</tr>
<tr>
<td class="inline_code">element('div &gt; a')</td>
<td>all nested <span class="inline_code">&lt;a&gt;</span>'s directly inside a nested <span class="inline_code">&lt;div&gt;</span></td>
</tr>
<tr>
<td class="inline_code">element('div &lt; a')</td>
<td>all nested <span class="inline_code">&lt;div&gt;</span>'s directly containing an <span class="inline_code">&lt;a&gt;</span></td>
</tr>
</tbody>
</table>
</div>
<div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import Element
&gt;&gt;&gt;
&gt;&gt;&gt; div = Element('&lt;div&gt; &lt;a&gt;1st&lt;/a&gt; &lt;a&gt;2nd&lt;/a&gt; &lt;/div&gt;')
&gt;&gt;&gt; print div('a:first-child')
&gt;&gt;&gt; print div('a:first-child')[0].source
[Element(tag='a')]
&lt;a&gt;1st&lt;/a&gt; </pre></div>
<h3>DOM</h3>
<p>The top-level element in the Document Object Model.</p>
<pre class="brush:python; gutter:false; light:true;">dom = DOM(html)</pre><pre class="brush:python; gutter:false; light:true;">dom.declaration # &lt;!doctype&gt; TEXT Node.
dom.head # &lt;head&gt; Element.
dom.body # &lt;body&gt; Element.</pre><p>The following example retrieves the most recent&nbsp;<a href="http://www.reddit.com/" target="_blank">reddit</a>&nbsp;entries. The pattern.web module does not include a reddit search engine, but we can parse entries directly from the HTML source. This is called <em>screen scraping</em>, and many websites will strongly dislike it.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import URL, DOM, plaintext
&gt;&gt;&gt;
&gt;&gt;&gt; url = URL('http://www.reddit.com/top/')
&gt;&gt;&gt; dom = DOM(url.download(cached=True))
&gt;&gt;&gt; for e in dom('div.entry')[:3]: # Top 3 reddit entries.
&gt;&gt;&gt;     for a in e('a.title')[:1]: # First &lt;a class="title"&gt;.
&gt;&gt;&gt;         print repr(plaintext(a.content))
u'Invisible Kitty'
u'Naturally, he said yes.'
u"I'd just like to remind everyone that /r/minecraft exists and not everyone wants"
"to have 10 Minecraft posts a day on their front page."</pre></div>
<p><span class="smallcaps"><br />Absolute URLs</span></p>
<p>Links parsed from the <span class="inline_code">DOM</span> can be relative (e.g., starting with <span class="inline_code">"../"</span> instead of <span class="inline_code">"http://"</span>).<br />To get the absolute URL, you can use the <span class="inline_code">abs()</span> function in combination with <span class="inline_code">URL.redirect</span>:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import URL, DOM, abs
&gt;&gt;&gt;
&gt;&gt;&gt; url = URL('http://www.clips.ua.ac.be')
&gt;&gt;&gt; dom = DOM(url.download())
&gt;&gt;&gt; for link in dom('a'):
&gt;&gt;&gt;     print abs(link.attributes.get('href',''), base=url.redirect or url.string) </pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="pdf"></a>PDF Parser</h2>
<p style="margin-top: 0.2em; margin-right: 0px; margin-bottom: 0.5em; margin-left: 0px;">Portable Document Format (PDF) is a popular open standard, where text, fonts, images and layout are contained in a single document that displays the same across systems. However, extracting the source text from a PDF can be difficult.</p>
<p style="margin-top: 0.2em; margin-right: 0px; margin-bottom: 0.5em; margin-left: 0px;">The <span class="inline_code">PDF</span> object (based on <a href="http://www.unixuser.org/~euske/python/pdfminer/" target="_self">PDFMiner</a>) parses the source text from a PDF file.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import URL, PDF
&gt;&gt;&gt;
&gt;&gt;&gt; url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
&gt;&gt;&gt; pdf = PDF(url.download())
&gt;&gt;&gt; print pdf.string
CLiPS Technical Report series 002 September 7, 2010
Tom De Smedt, Vincent Van Asch, Walter Daelemans
Computational Linguistics &amp; Psycholinguistics Research Center
... </pre></div>
<p style="margin-top: 0.2em; margin-right: 0px; margin-bottom: 0.5em; margin-left: 0px;">URLs linking to a PDF document can be identified with: <span class="inline_code">URL.mimetype</span> <span class="inline_code">in</span> <span class="inline_code">MIMETYPE_PDF</span>.</p>
<p>&nbsp;</p>
<hr />
<h2><a name="crawler"></a>Crawler</h2>
<p>A web crawler or web spider can be used to traverse the web automatically. The <span class="inline_code">Crawler</span>&nbsp;object takes a list of URLs. These are then visited by the crawler. If they lead to a web page, the HTML content is parsed for new links. These are added to the list of links scheduled for a visit.</p>
<p>The given <span class="inline_code">domains</span> is a list of allowed domain names. An empty list means the crawler can visit the entire web. The given <span class="inline_code">delay</span> defines the number of seconds to wait before revisiting the same (sub)domain – continually hammering one server with a robot disrupts requests from the website's regular visitors (this is called a <em>denial-of-service attack</em>).</p>
<pre class="brush:python; gutter:false; light:true;">crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)</pre><pre class="brush:python; gutter:false; light:true;">crawler.domains # Domains allowed to visit (e.g., ['clips.ua.ac.be']).
crawler.delay # Delay between visits to the same (sub)domain.
crawler.history # Dictionary of (domain, time last visited)-items.
crawler.visited # Dictionary of URLs visited.
crawler.sort # FIFO | LIFO (how new links are queued).
crawler.done # True when all links have been visited.</pre><pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">crawler.push(link, priority=1.0, sort=LIFO)
crawler.pop(remove=True)
crawler.next # Yields the next scheduled link = Crawler.pop(False)</pre><pre class="brush:python; gutter:false; light:true;">crawler.crawl(method=DEPTH) # DEPTH | BREADTH | None.</pre><pre class="brush:python; gutter:false; light:true;">crawler.priority(link, method=DEPTH)
crawler.follow(link)
crawler.visit(link, source=None)
crawler.fail(link)</pre><h3>Crawling process</h3>
<ul>
<li><span class="inline_code">Crawler.crawl()</span> is meant to be called continuously in a loop. It selects a link to visit and parses the HTML content for new links. The <span class="inline_code">method</span> parameter defines whether the crawler prefers internal links (<span class="inline_code">DEPTH</span>) or external links to other domains (<span class="inline_code">BREADTH</span>). If the link leads to a recently visited domain (i.e., elapsed time &lt; <span class="inline_code">Crawler.delay</span>) it is temporarily skipped. To disable this behaviour, use&nbsp;an optional <span class="inline_code">throttle</span>&nbsp;parameter &gt;=&nbsp;<span class="inline_code">Crawler.delay</span>.</li>
</ul>
<ul>
<li><span class="inline_code">Crawler.priority()</span> is called from <span class="inline_code">Crawler.crawl()</span> to determine the priority (<span class="inline_code">0.0</span>-<span class="inline_code">1.0</span>) of a new <span class="inline_code">Link</span>, where&nbsp;links with highest priority are visited first.&nbsp;It can be overridden in a subclass.&nbsp;</li>
</ul>
<ul>
<li><span class="inline_code">Crawler.follow()</span> is called from <span class="inline_code">Crawler.crawl()</span> to determine if it should schedule the given <span class="inline_code">Link</span>&nbsp;for a visit. By default it yields <span class="inline_code">True</span>. It can be overridden to disallow selected links.</li>
</ul>
<ul>
<li><span class="inline_code">Crawler.visit()</span> is called from <span class="inline_code">Crawler.crawl()</span> when a <span class="inline_code">Link</span> is visited. The given&nbsp;<span class="inline_code">source</span>&nbsp;is a HTML string with the page content. By default, this method does nothing (it should be overridden).</li>
</ul>
<ul>
<li><span class="inline_code">Crawler.fail()</span> is called from <span class="inline_code">Crawler.crawl()</span> for links whose MIME-type could not be determined, or which raise a <span class="inline_code">URLError</span> while downloading.</li>
</ul>
<p>The crawler uses <span class="inline_code">Link</span> objects internally, which contain additional information besides the URL string:</p>
<pre class="brush:python; gutter:false; light:true;">link = Link(url, text='', relation='')</pre><pre class="brush:python; gutter:false; light:true;">link.url # Parsed from &lt;a href=''&gt; attribute.
link.text # Parsed from &lt;a title=''&gt; attribute.
link.relation # Parsed from &lt;a rel=''&gt; attribute.
link.referrer # Parent web page URL.</pre><p>The following example shows a subclass of <span class="inline_code">Crawler</span> that prints each link it visits. Since it uses <span class="inline_code">DEPTH</span> for crawling, it will prefer internal links.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Crawler
&gt;&gt;&gt;
&gt;&gt;&gt; class Polly(Crawler):
&gt;&gt;&gt; def visit(self, link, source=None):
&gt;&gt;&gt; print 'visited:', repr(link.url), 'from:', link.referrer
&gt;&gt;&gt; def fail(self, link):
&gt;&gt;&gt; print 'failed:', repr(link.url)
&gt;&gt;&gt;
&gt;&gt;&gt; p = Polly(links=['http://www.clips.ua.ac.be/'], delay=3)
&gt;&gt;&gt; while not p.done:
&gt;&gt;&gt; p.crawl(method=DEPTH, cached=False, throttle=3)
visited: u'http://www.clips.ua.ac.be/'
visited: u'http://www.clips.ua.ac.be/#navigation'
visited: u'http://www.clips.ua.ac.be/colloquia'
visited: u'http://www.clips.ua.ac.be/computational-linguistics'
visited: u'http://www.clips.ua.ac.be/contact'
</pre></div>
<p><span class="small"><span style="text-decoration: underline;">Note</span>: <span class="inline_code">Crawler.crawl()</span> takes the same parameters as <span class="inline_code">URL.download()</span>, e.g., </span><span class="small"><span class="inline_code">cached=False</span> or <span class="inline_code">throttle=10</span>.<br /></span></p>
<h3>Crawl function</h3>
<p>The <span class="inline_code">crawl()</span> function returns an iterator&nbsp;that yields <span class="inline_code">(Link,</span> <span class="inline_code">source)</span>-tuples. When it is <em>idle</em> (e.g., waiting for the <span class="inline_code">delay</span> on a domain) it yields (<span class="inline_code">None</span>, <span class="inline_code">None</span>).</p>
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">crawl(
links = [],
domains = [],
delay = 20.0,
sort = FIFO,
method = DEPTH, **kwargs)</pre><div class="example">
<pre class="brush: python;gutter: false; light: true; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import crawl
&gt;&gt;&gt;
&gt;&gt;&gt; for link, source in crawl('http://www.clips.ua.ac.be/', delay=3, throttle=3):
&gt;&gt;&gt; print link
Link(url=u'http://www.clips.ua.ac.be/')
Link(url=u'http://www.clips.ua.ac.be/#navigation')
Link(url=u'http://www.clips.ua.ac.be/computational-linguistics')
...</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="mail"></a>E-mail</h2>
<p>The <span class="inline_code">Mail</span> object can be used to retrieve e-mail messages from Gmail, provided that IMAP is <a href="http://mail.google.com/support/bin/answer.py?answer=77695">enabled</a>.&nbsp;It may also work with other services, by passing the server address to the <span class="inline_code">service</span> parameter (e.g.,&nbsp;<span class="inline_code">service="imap.gmail.com"</span>).&nbsp;With <span class="inline_code">secure=False</span> (no SSL) the default <span class="inline_code">port</span> is 143.</p>
<pre class="brush:python; gutter:false; light:true;">mail = Mail(username, password, service=GMAIL, port=993, secure=True)</pre><pre class="brush:python; gutter:false; light:true;">mail.folders # Dictionary of (name, MailFolder)-items.
mail.[folder] # E.g., Mail.inbox.read(id)
mail.[folder].count # Number of messages in folder.
</pre><pre class="brush:python; gutter:false; light:true;">mail.[folder].search(query, field=FROM) # FROM | SUBJECT | DATE
mail.[folder].read(id, attachments=False, cached=True)</pre><ul>
<li><span class="inline_code">Mail.folders</span> is a <span class="inline_code">name</span> → <span class="inline_code">MailFolder</span> dictionary. Common names include&nbsp;<span class="inline_code">inbox</span>, <span class="inline_code">spam</span>&nbsp;and&nbsp;<span class="inline_code">trash</span>.</li>
<li><span class="inline_code">MailFolder.search()</span> returns a list of e-mail id's, most recent first.</li>
<li><span class="inline_code">MailFolder.read()</span> retrieves the e-mail with given id as a <span class="inline_code">Message</span>.</li>
</ul>
<div><span style="line-height: 18px;">A <span class="inline_code">Message</span> has the following properties:</span></div>
<pre class="brush:python; gutter:false; light:true;">message = Mail.[folder].read(i)</pre><pre class="brush:python; gutter:false; light:true;">message.author # Unicode string, sender name + e-mail address.
message.email_address # Unicode string, sender e-mail address.
message.date # Unicode string, date received.
message.subject # Unicode string, message subject.
message.body # Unicode string, message body.
message.attachments # List of (MIME-type, str)-tuples.
</pre><p>The following example retrieves spam e-mails containing the word "wish":</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Mail, GMAIL, SUBJECT
&gt;&gt;&gt;
&gt;&gt;&gt; gmail = Mail(username='...', password='...', service=GMAIL)
&gt;&gt;&gt; print gmail.folders.keys()
['drafts', 'spam', 'personal', 'work', 'inbox', 'mail', 'starred', 'trash']</pre></div>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; i = gmail.spam.search('wish', field=SUBJECT)[0] # What riches await...
&gt;&gt;&gt; m = gmail.spam.read(i)
&gt;&gt;&gt; print ' From:', m.author
&gt;&gt;&gt; print 'Subject:', m.subject
&gt;&gt;&gt; print 'Message:'
&gt;&gt;&gt; print m.body
From: u'Vegas VIP Clib &lt;amllhbmjb@acciongeoda.org&gt;'
Subject: u'Your wish has been granted'
Message: u'No one has claimed our jackpot! This is your chance to try!'
</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="locale"></a>Locale</h2>
<p>The pattern.web.locale module&nbsp;contains functions for region and language codes, based on the ISO-639 language code (e.g., <span class="inline_code">en</span>), the ISO-3166 region code (e.g., <span class="inline_code">US</span>) and the IETF BCP 47 language-region specification (<span class="inline_code">en-US</span>):</p>
<pre class="brush:python; gutter:false; light:true;">encode_language(name) # 'English' =&gt; 'en'</pre><pre class="brush:python; gutter:false; light:true;">decode_language(code) # 'en' =&gt; 'English'</pre><pre class="brush:python; gutter:false; light:true;">encode_region(name) # 'United States' =&gt; 'US'</pre><pre class="brush:python; gutter:false; light:true;">decode_region(code) # 'US' =&gt; 'United States'</pre><pre class="brush:python; gutter:false; light:true;">languages(region) # 'US' =&gt; ['en']</pre><pre class="brush:python; gutter:false; light:true;">regions(language) # 'en' =&gt; ['AU', 'BZ', 'CA', ...]</pre><pre class="brush:python; gutter:false; light:true;">regionalize(language) # 'en' =&gt; ['en-US', 'en-AU', ...]</pre><pre class="brush:python; gutter:false; light:true;">market(language) # 'en' =&gt; 'en-US'</pre><p>The <span class="inline_code">geocode()</span> function recognizes a number of world capital cities and returns a tuple (<span class="inline_code">latitude</span>, <span class="inline_code">longitude</span>, <span class="inline_code">ISO-639</span>, <span class="inline_code">region</span>).</p>
<pre class="brush:python; gutter:false; light:true;">geocode(location) # 'Brussels' =&gt; (50.83, 4.33, u'nl', u'Belgium')</pre><p>This is useful in combination with the <span class="inline_code">geo</span> parameter for <span class="inline_code">Twitter.search()</span> to obtain regional tweets:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Twitter
&gt;&gt;&gt; from pattern.web.locale import geocode
&gt;&gt;&gt;
&gt;&gt;&gt; twitter = Twitter(language='en')
&gt;&gt;&gt; for tweet in twitter.search('restaurant', geo=geocode('Brussels')[:2]):
&gt;&gt;&gt; print tweet.text
u'Did you know: every McDonalds restaurant has free internet in Belgium...'</pre></div>
<p>&nbsp;</p>
<hr />
<h2><a name="cache"></a>Cache</h2>
<p>By default, <span class="inline_code">URL.download()</span> and <span class="inline_code">SearchEngine.search()</span> will cache results locally. Once the results of a query have been cached, there is no need to connect to the internet (i.e., the query runs faster).&nbsp;Over time the cache can grow quite large, filling up with whatever was downloaded – from tweets to zip archives.</p>
<p>To empty the cache:</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import cache
&gt;&gt;&gt; cache.clear()
</pre></div>
<p>&nbsp;</p>
<hr />
<h2>See also</h2>
<ul>
<li><a href="http://www.crummy.com/software/BeautifulSoup/" target="_blank">BeautifulSoup</a> (BSD): r<span>obust HTML parser for Python.</span></li>
<li><span><a href="http://scrapy.org/" target="_blank">Scrapy</a> (BSD): s</span><span>creen scraping and web crawling with Python.</span></li>
</ul>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,397 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>pattern</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/pattern" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/pattern</a></div>
<h1>pattern</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1350" class="node node-type-page"><div class="node-inner">
<div class="content">
<p><span class="big">Pattern is a web mining module for the Python programming language.</span></p>
<p><span class="big">It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, a HTML DOM parser), natural language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning (vector space model, clustering, SVM), network analysis and &lt;canvas&gt; visualization.</span></p>
<p>The module is free, well-documented and bundled with 50+ examples and 350+ unit tests.</p>
<p><img src="../g/pattern_schema.gif" alt="" width="620" height="180" /></p>
<hr />
<h2>Download</h2>
<table>
<tbody>
<tr>
<td><a onclick="javascript:_gaq.push(['_trackPageview', '/downloads/pattern']);" href="http://www.clips.ua.ac.be/media/pattern-2.6.zip" target="_self"><img src="../g/download.gif" alt="download" align="left" /></a></td>
<td><strong>Pattern 2.6</strong>&nbsp;| <a onclick="javascript:_gaq.push(['_trackPageview', '/downloads/pattern']);" href="http://www.clips.ua.ac.be/media/pattern-2.6.zip" target="_self">download</a> (.zip, 23MB)<br />
<ul>
<li>Requires: Python 2.5+ on Windows | Mac | Linux</li>
<li>Licensed under <a href="http://www.linfo.org/bsdlicense.html" target="_blank">BSD</a></li>
<li>Latest releases: <a class="noexternal" href="http://www.clips.ua.ac.be/media/pattern-2.6.zip">2.6</a> |&nbsp;<a class="noexternal" href="http://www.clips.ua.ac.be/media/pattern-2.5.zip">2.5</a> |&nbsp;<a class="noexternal" href="http://www.clips.ua.ac.be/media/pattern-2.4.zip">2.4</a> | <a class="noexternal" href="http://www.clips.ua.ac.be/media/pattern-2.3.zip">2.3</a>&nbsp;|&nbsp;<a class="noexternal" href="http://www.clips.ua.ac.be/media/pattern-2.2.zip">2.2</a> |&nbsp;<a class="noexternal" href="http://www.clips.ua.ac.be/media/pattern-2.1.zip">2.1</a> |&nbsp;<a class="noexternal" href="http://www.clips.ua.ac.be/media/pattern-2.0.zip">2.0</a></li>
<li>Authors:<br />&nbsp;Tom De Smedt (<em>tom at organisms.be</em>)<br />&nbsp;Walter Daelemans&nbsp;</li>
</ul>
<p><span class="small"><span style="text-decoration: underline;">Reference</span>: De Smedt, T. &amp; Daelemans, W. (2012)</span>.<br /><span class="small">Pattern for Python. <em>Journal of Machine Learning Research</em>, 13: 2031–2035.</span></p>
<p id="checksum" class="grey"><span class="small"><span style="text-decoration: underline;">SHA256</span> checksum&nbsp;of the .zip:<br />28213f05d94a86d2de1d8a03525d456a9e68dc3b563dc2481ad08fe3db180d02</span></p>
</td>
<td>
</td>
</tr>
</tbody>
</table>
<p>&nbsp;</p>
<hr />
<table border="0">
<tbody>
<tr>
<td style="width: 200px;">
<h2>Modules</h2>
<ul>
<li><a href="pattern-web.html">pattern.web</a></li>
<li><a href="pattern-db.html">pattern.db</a></li>
<li><a href="pattern-en.html">pattern.en</a>&nbsp;|&nbsp;<a href="pattern-es.html">es</a>&nbsp;| <a href="pattern-de.html">de</a> | <a href="pattern-fr.html">fr</a> | <a href="pattern-it.html">it</a> |&nbsp;<a href="pattern-nl.html">nl</a></li>
<li><a href="pattern-search.html">pattern.search</a></li>
<li><a href="pattern-vector.html">pattern.vector</a></li>
<li><a href="pattern-graph.html">pattern.graph</a>&nbsp;</li>
</ul>
<p><span class="smallcaps">Helper modules</span></p>
<ul style="margin-top: 0;">
<li><a href="pattern-metrics.html">pattern.metrics</a></li>
<li><a href="pattern-canvas.html">canvas.js</a></li>
</ul>
<p><span class="smallcaps">Command-line</span></p>
<ul style="margin-top: 0;">
<li><a href="pattern-shell.html">Command-line interface</a></li>
</ul>
</td>
<td>
<h2><a name="contribute"></a>Contribute</h2>
<ul>
<li><a href="pattern-dev.html">Developer documentation</a></li>
<li><a href="https://github.com/clips/pattern" target="_blank">GitHub repository</a></li>
<li><a href="http://groups.google.com/group/pattern-for-python" target="_blank">Google group</a></li>
</ul>
<form action="https://www.paypal.com/cgi-bin/webscr" method="post"><input type="hidden" name="cmd" value="_s-xclick" /> <input type="hidden" name="hosted_button_id" value="HW2GU5PNWYQV8" /> <input type="image" name="submit" src="../g/paypal-donate.jpg" alt="PayPal - The safer, easier way to pay online!" /> <img src="https://www.paypalobjects.com/en_US/i/scr/pixel.gif" alt="" width="1" height="1" border="0" /></form>
</td>
</tr>
</tbody>
</table>
<p>&nbsp;</p>
<hr />
<h2>Installation</h2>
<p>Pattern is written for Python 2.5+ (also supports Python 3.6+). The module has no external dependencies, except <span class="inline_code">LSA</span> in the pattern.vector module, which requires <a href="http://numpy.scipy.org/" target="_blank">NumPy</a> (installed by default on Mac OS X).&nbsp;</p>
<p>To install Pattern so that the module is available in all Python scripts, from the command line do:</p>
<div class="install">
<pre class="gutter:false; light:true;">&gt; cd pattern-3.6
&gt; python setup.py install&nbsp;</pre></div>
<p>If you have pip, you can automatically download and install from the PyPi repository:</p>
<div class="install">
<pre class="gutter:false; light:true;">&gt; pip install pattern</pre></div>
<p>If none of the above works, you can make Python aware of the module in three ways:</p>
<ul>
<li>Put the <span class="inline_code">pattern</span>&nbsp;subfolder in the .zip archive in the same folder as your script.</li>
<li>Put the <span class="inline_code">pattern</span>&nbsp;subfolder in the standard location for modules so it is available to all scripts:<br /><span class="inline_code">c:\python27\Lib\site-packages\</span>&nbsp;(Windows),<br /><span class="inline_code"> /Library/Python/2.7/site-packages/</span>&nbsp;(Mac),<br /><span class="inline_code">/usr/lib/python2.7/site-packages/</span>&nbsp;(Unix).<span style="font-family: Courier, monospace; font-size: small;"><span style="font-size: 12px;"><br /></span></span></li>
<li>Add the location of the module to&nbsp;<span class="inline_code">sys.path</span>&nbsp;in your Python script, before importing it:</li>
</ul>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; import sys; sys.path.append('/users/tom/desktop/pattern')
&gt;&gt;&gt; from pattern.web import Twitter </pre></div>
<p>&nbsp;</p>
<hr />
<h2>Quick overview</h2>
<h3>pattern.web</h3>
<p>The&nbsp;<a href="pattern-web.html">pattern.web</a>&nbsp;module is a web toolkit that contains API's (Google, Gmail, Bing, Twitter, Facebook, Wikipedia, Wiktionary, DBPedia, Flickr, ...), a robust HTML DOM parser and a web crawler.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Twitter, plaintext
&gt;&gt;&gt;
&gt;&gt;&gt; twitter = Twitter(language='en')
&gt;&gt;&gt; for tweet in twitter.search('"more important than"', cached=False):
&gt;&gt;&gt; print plaintext(tweet.text)
'The mobile web is more important than mobile apps.'
'Start slowly, direction is more important than speed.'
'Imagination is more important than knowledge. - Albert Einstein'
... </pre></div>
<h3>pattern.en</h3>
<p>The&nbsp;<a href="pattern-en.html">pattern.en</a>&nbsp;module is a natural language processing (NLP) toolkit for English. Because language is ambiguous (e.g., <em>I can</em>&nbsp;↔ <em>a can</em>) it uses statistical approaches + regular expressions. This means that it is fast, quite accurate and occasionally incorrect. It has a part-of-speech tagger that identifies word types (e.g., noun, verb, adjective), word inflection (conjugation, singularization) and a WordNet API.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import parse
&gt;&gt;&gt;
&gt;&gt;&gt; s = 'The mobile web is more important than mobile apps.'
&gt;&gt;&gt; s = parse(s, relations=True, lemmata=True)
&gt;&gt;&gt; print s
'The/DT/B-NP/O/NP-SBJ-1/the mobile/JJ/I-NP/O/NP-SBJ-1/mobile' ...
</pre></div>
<table class="border">
<tbody>
<tr>
<td class="smallcaps" style="text-align: right;">word</td>
<td class="smallcaps" style="text-align: center;">tag</td>
<td class="smallcaps" style="text-align: center;">chunk</td>
<td class="smallcaps" style="text-align: center;">role</td>
<td class="smallcaps" style="text-align: center;">id</td>
<td class="smallcaps" style="text-align: center;">pnp</td>
<td class="smallcaps">lemma</td>
</tr>
<tr>
<td style="text-align: right;">The</td>
<td class="inline_code" style="text-align: center;">DT</td>
<td class="inline_code" style="text-align: center;">NP&nbsp;</td>
<td class="inline_code" style="text-align: center;">SBJ</td>
<td class="inline_code" style="text-align: center;">1</td>
<td class="inline_code" style="text-align: center;">-</td>
<td><em>the</em></td>
</tr>
<tr>
<td style="text-align: right;">mobile</td>
<td class="inline_code" style="text-align: center;">JJ</td>
<td class="inline_code" style="text-align: center;">NP^</td>
<td class="inline_code" style="text-align: center;">SBJ</td>
<td class="inline_code" style="text-align: center;">1</td>
<td class="inline_code" style="text-align: center;">-</td>
<td><em>mobile</em></td>
</tr>
<tr>
<td style="text-align: right;">web</td>
<td class="inline_code" style="text-align: center;">NN</td>
<td class="inline_code" style="text-align: center;">NP^</td>
<td class="inline_code" style="text-align: center;">SBJ</td>
<td class="inline_code" style="text-align: center;">1</td>
<td class="inline_code" style="text-align: center;">-</td>
<td><em>web</em></td>
</tr>
<tr>
<td style="text-align: right;">is</td>
<td class="inline_code" style="text-align: center;">VBZ</td>
<td class="inline_code" style="text-align: center;">VP&nbsp;</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">1</td>
<td class="inline_code" style="text-align: center;">-</td>
<td><em>be</em></td>
</tr>
<tr>
<td style="text-align: right;">more</td>
<td class="inline_code" style="text-align: center;">RBR</td>
<td class="inline_code" style="text-align: center;">ADJP&nbsp;</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">-</td>
<td><em>more</em></td>
</tr>
<tr>
<td style="text-align: right;">important</td>
<td class="inline_code" style="text-align: center;">JJ</td>
<td class="inline_code" style="text-align: center;">ADJP^</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">-</td>
<td><em>important</em></td>
</tr>
<tr>
<td style="text-align: right;">than</td>
<td class="inline_code" style="text-align: center;">IN</td>
<td class="inline_code" style="text-align: center;">PP&nbsp;</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">PNP</td>
<td><em>than</em></td>
</tr>
<tr>
<td style="text-align: right;">mobile</td>
<td class="inline_code" style="text-align: center;">JJ</td>
<td class="inline_code" style="text-align: center;">NP&nbsp;</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">PNP</td>
<td><em>mobile</em></td>
</tr>
<tr>
<td style="text-align: right;">apps</td>
<td class="inline_code" style="text-align: center;">NNS</td>
<td class="inline_code" style="text-align: center;">NP^</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">PNP</td>
<td><em>app</em></td>
</tr>
<tr>
<td style="text-align: right;">.</td>
<td class="inline_code" style="text-align: center;">.</td>
<td class="inline_code" style="text-align: center;">-&nbsp;</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">-</td>
<td class="inline_code" style="text-align: center;">-</td>
<td>.</td>
</tr>
</tbody>
</table>
<p>The text has been annotated with word types,&nbsp;for example nouns (<span class="postag">NN</span>), verbs (<span class="postag">VB</span>),&nbsp;adjectives (<span class="postag">JJ</span>) and determiners (<span class="postag">DT</span>), word roles (e.g.,&nbsp;sentence subject&nbsp;<span class="postag">SBJ</span>) and prepositional noun phrases (<span class="postag">PNP</span>). To iterate over the parts in the tagged text we can construct a <em>parse tree</em>.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt;
&gt;&gt;&gt; s = 'The mobile web is more important than mobile apps.'
&gt;&gt;&gt; s = parsetree(s)
&gt;&gt;&gt; for sentence in s:
&gt;&gt;&gt; for chunk in sentence.chunks:
&gt;&gt;&gt; for word in chunk.words:
&gt;&gt;&gt; print word,
&gt;&gt;&gt; print
Word(u'The/DT') Word(u'mobile/JJ') Word(u'web/NN')
Word(u'is/VBZ')
Word(u'more/RBR') Word(u'important/JJ')
Word(u'than/IN')
Word(u'mobile/JJ') Word(u'apps/NNS')
</pre></div>
<p>Parsers for Spanish, French, Italian, German and Dutch are also available: <br /><a href="pattern-es.html">pattern.es</a>&nbsp;| <a href="pattern-fr.html">pattern.fr</a> | <a href="pattern-it.html">pattern.it</a> |&nbsp;<a href="pattern-de.html">pattern.de</a>&nbsp;|&nbsp;<a href="pattern-nl.html">pattern.nl</a></p>
<h3>pattern.search</h3>
<p>The&nbsp;<a href="pattern-search.html">pattern.search</a>&nbsp;module contains a search algorithm to retrieve sequences of words (called <em>n-grams</em>) from tagged text.</p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt; from pattern.search import search
&gt;&gt;&gt;
&gt;&gt;&gt; s = 'The mobile web is more important than mobile apps.'
&gt;&gt;&gt; s = parsetree(s, relations=True, lemmata=True)
&gt;&gt;&gt;
&gt;&gt;&gt; for match in search('NP be RB?+ important than NP', s):
&gt;&gt;&gt; print match.constituents()[-1], '=&gt;', \
&gt;&gt;&gt; match.constituents()[0]
Chunk('mobile apps/NP') =&gt; Chunk('The mobile web/NP-SBJ-1')
</pre></div>
<p>The search pattern&nbsp;<span class="inline_code">NP</span> <span class="inline_code">be</span> <span class="inline_code">RB?+</span> <span class="inline_code">important</span> <span class="inline_code">than</span> <span class="inline_code">NP</span> means any noun phrase (<span class="postag">NP</span>) followed by the verb <em>to be</em>, followed by zero or more adverbs (<span class="postag">RB</span>, e.g.,&nbsp;<em>much</em>, <em>more</em>), followed by the words <em>important than</em>, followed by any noun phrase. It will also match "<em>The mobile web <span style="text-decoration: underline;">will</span> <span style="text-decoration: underline;">be</span> <span style="text-decoration: underline;">much</span> <span style="text-decoration: underline;">less</span> important than mobile apps</em>" and other grammatical variations.</p>
<h3>pattern.vector</h3>
<p>The&nbsp;<a href="pattern-vector.html">pattern.vector</a>&nbsp;module is a toolkit for machine learning, based on a vector space model&nbsp;of bag-of-words documents with weighted features (e.g., tf-idf) and distance metrics (e.g., cosine similarity, infogain).&nbsp;Models can be used for clustering (<em>k</em>-means, hierarchical), classification (Naive Bayes, Perceptron,&nbsp;<em>k-</em>NN, SVM) and latent semantic analysis (LSA).</p>
<div>
<div class="example">
<pre class="brush: python;gutter: false; fontsize: 100; first-line: 1; ">&gt;&gt;&gt; from pattern.web import Twitter
&gt;&gt;&gt; from pattern.en import tag
&gt;&gt;&gt; from pattern.vector import KNN, count
&gt;&gt;&gt;
&gt;&gt;&gt; twitter, knn = Twitter(), KNN()
&gt;&gt;&gt;
&gt;&gt;&gt; for i in range(1, 10):
&gt;&gt;&gt; for tweet in twitter.search('#win OR #fail', start=i, count=100):
&gt;&gt;&gt; s = tweet.text.lower()
&gt;&gt;&gt; p = '#win' in s and 'WIN' or 'FAIL'
&gt;&gt;&gt; v = tag(s)
&gt;&gt;&gt; v = [word for word, pos in v if pos == 'JJ'] # JJ = adjective
&gt;&gt;&gt; v = count(v)
&gt;&gt;&gt; if v:
&gt;&gt;&gt; knn.train(v, type=p)
&gt;&gt;&gt;
&gt;&gt;&gt; print knn.classify('sweet potato burger')
&gt;&gt;&gt; print knn.classify('stupid autocorrect')
'WIN'
'FAIL' </pre></div>
</div>
<p>This example trains a classifier on adjectives mined from Twitter. First, tweets with hashtag #win or #fail are mined. For example: <em>"$20 tip off a <span style="text-decoration: underline;">sweet</span> <span style="text-decoration: underline;">little</span> <span style="text-decoration: underline;">old</span> lady today #win"</em>. The word part-of-speech tags are parsed, keeping only adjectives. Each tweet is transformed to a vector, a dictionary of adjective → count items, labeled <span class="inline_code">WIN</span> or <span class="inline_code">FAIL</span>. The classifier uses the vectors to learn which other, unknown tweets look more like&nbsp;<span class="inline_code">WIN</span>&nbsp;(e.g., <em>sweet potato burger</em>) or more like <span class="inline_code">FAIL</span> (e.g., <em>stupid autocorrect</em>).</p>
<h3>pattern.graph</h3>
<p>The&nbsp;<a href="pattern-graph.html">pattern.graph</a>&nbsp;module provides a graph data structure that represents relations between nodes (e.g., terms, concepts). Graphs can be exported as HTML <span class="inline_code">&lt;canvas&gt;</span> animations (<span class="link-maintenance"><a href="http://www.clips.ua.ac.be/media/pattern-graph" target="_blank">demo</a></span>). In the example below, more <em>central</em> nodes (= more incoming traffic) are colored in blue.</p>
<p><img class="border" src="../g/pattern_graph5.jpg" alt="" width="610" height="198" /></p>
<div class="example">
<pre class="brush:python; gutter:false; light:true;">&gt;&gt;&gt; from pattern.web import Bing, plaintext
&gt;&gt;&gt; from pattern.en import parsetree
&gt;&gt;&gt; from pattern.search import search
&gt;&gt;&gt; from pattern.graph import Graph
&gt;&gt;&gt;
&gt;&gt;&gt; g = Graph()
&gt;&gt;&gt; for i in range(10):
&gt;&gt;&gt; for result in Bing().search('"more important than"', start=i+1, count=50):
&gt;&gt;&gt; s = result.text.lower()
&gt;&gt;&gt; s = plaintext(s)
&gt;&gt;&gt; s = parsetree(s)
&gt;&gt;&gt; p = '{NP} (VP) more important than {NP}'
&gt;&gt;&gt; for m in search(p, s):
&gt;&gt;&gt; x = m.group(1).string # NP left
&gt;&gt;&gt; y = m.group(2).string # NP right
&gt;&gt;&gt; if x not in g:
&gt;&gt;&gt; g.add_node(x)
&gt;&gt;&gt; if y not in g:
&gt;&gt;&gt; g.add_node(y)
&gt;&gt;&gt; g.add_edge(g[x], g[y], stroke=(0,0,0,0.75)) # R,G,B,A
&gt;&gt;&gt;
&gt;&gt;&gt; g = g.split()[0] # Largest subgraph.
&gt;&gt;&gt;
&gt;&gt;&gt; for n in g.sorted()[:40]: # Sort by Node.weight.
&gt;&gt;&gt; n.fill = (0, 0.5, 1, 0.75 * n.weight)
&gt;&gt;&gt;
&gt;&gt;&gt; g.export('test', directed=True, weighted=0.6) </pre></div>
<p>Some relations (= edges) could use some extra post-processing, e.g., in <em>nothing is more important than life</em>, <em>nothing</em> is <span style="text-decoration: underline;">not</span> more important than <em>life</em>.</p>
<p>&nbsp;</p>
<hr />
<h2>Case studies&nbsp;</h2>
<p>Case studies with hands-on source code examples.</p>
<table border="0">
<tbody>
<tr>
<td>
<p><a href="http://www.clips.ua.ac.be/pages/modeling-creativity-with-a-semantic-network-of-common-sense"><img src="../g/pattern_example_semantic_network.jpg" alt="" width="70" height="70" /><br /></a></p>
</td>
<td>&nbsp;</td>
<td><span class="smallcaps">modeling creativity with a semantic network of common sense </span><span class="small">(2013)</span>&nbsp;<br />This case study offers a computational model of creativity, by representing the mind as a semantic network of common sense, using <a class="link-maintenance" href="pattern-graph.html">pattern.graph</a>&nbsp;&amp; <a class="link-maintenance" href="pattern-web.html">web</a>.<br /><a href="http://www.clips.ua.ac.be/pages/modeling-creativity-with-a-semantic-network-of-common-sense">read more »</a></td>
</tr>
<tr>
<td>
<p><a class="noexternal" href="http://www.clips.ua.ac.be/pages/using-wiktionary-to-build-an-italian-part-of-speech-tagger"><img src="../g/pattern_example_italian.jpg" alt="" width="70" height="70" /><br /></a></p>
</td>
<td>&nbsp;</td>
<td><span class="smallcaps">using wiktionary to build an italian part-of-speech tagger </span><span class="small">(2013)</span> <br />This case study demonstrates how a part-of-speech tagger for Italian (see <a class="link-maintenance" href="pattern-it.html">pattern.it</a>) can be built by mining Wiktionary and Wikipedia. &nbsp;<br /><a href="http://www.clips.ua.ac.be/pages/using-wiktionary-to-build-an-italian-part-of-speech-tagger">read more »</a></td>
</tr>
<tr>
<td>
<p><a class="noexternal" href="http://www.clips.ua.ac.be/pages/using-wikicorpus-nltk-to-build-a-spanish-part-of-speech-tagger"><img src="../g/pattern_example_spanish.jpg" alt="" width="70" height="70" /><br /></a></p>
</td>
<td>&nbsp;</td>
<td><span class="smallcaps">using wikicorpus and nltk to build a spanish part-of-speech tagger </span><span class="small">(2012)</span><br />This case study demonstrates how a part-of-speech tagger for Spanish (see <a class="link-maintenance" href="pattern-es.html">pattern.es</a>) can be built by using NLTK and the freely available Wikicorpus. <br /><a href="http://www.clips.ua.ac.be/pages/using-wikicorpus-nltk-to-build-a-spanish-part-of-speech-tagger">read more »</a></td>
</tr>
<tr>
<td>
<p><a class="noexternal" href="http://www.clips.ua.ac.be/pages/pattern-examples-elections"><img src="../g/pattern_example_elections.jpg" alt="" width="70" height="70" /><br /></a></p>
</td>
<td>&nbsp;</td>
<td><span class="smallcaps">belgian elections</span><span class="smallcaps">, twitter sentiment analysis&nbsp;</span><span class="small">(2010)</span><br />This case study uses sentiment analysis (e.g., positive or negative tone) on 7,500 Dutch and French tweets (see <a class="link-maintenance" href="pattern-web.html">pattern.web</a> |&nbsp;<a class="link-maintenance" href="pattern-nl.html">nl</a>&nbsp;|&nbsp;<a class="link-maintenance" href="pattern-fr.html">fr</a>) in the weeks before the Belgian 2010 elections. <br /><a href="http://www.clips.ua.ac.be/pages/pattern-examples-elections">read more »</a></td>
</tr>
<tr>
<td>
<p><a class="noexternal" href="http://www.clips.ua.ac.be/pages/pattern-examples-100days"><img src="../g/pattern_example_100days.jpg" alt="" width="70" height="70" /><br /></a></p>
</td>
<td>&nbsp;</td>
<td><span class="smallcaps">web mining and visualization </span><span class="small">(2010)</span><br />This case study uses a number of different approaches to mine, correlate and visualize about 6,000 Google News items and 70,000 tweets.&nbsp;<br /><a href="http://www.clips.ua.ac.be/pages/pattern-examples-100days">read more »</a></td>
</tr>
</tbody>
</table>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1,54 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>stop-words</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" href="../clips.css" />
<style>
/* Small fixes because we omit the online layout.css. */
h3 { line-height: 1.3em; }
#page { margin-left: auto; margin-right: auto; }
#header, #header-inner { height: 175px; }
#header { border-bottom: 1px solid #C6D4DD; }
table { border-collapse: collapse; }
#checksum { display: none; }
</style>
<link href="../js/shCore.css" rel="stylesheet" type="text/css" />
<link href="../js/shThemeDefault.css" rel="stylesheet" type="text/css" />
<script language="javascript" src="../js/shCore.js"></script>
<script language="javascript" src="../js/shBrushXml.js"></script>
<script language="javascript" src="../js/shBrushJScript.js"></script>
<script language="javascript" src="../js/shBrushPython.js"></script>
</head>
<body class="node-type-page one-sidebar sidebar-right section-pages">
<div id="page">
<div id="page-inner">
<div id="header"><div id="header-inner"></div></div>
<div id="content">
<div id="content-inner">
<div class="node node-type-page">
<div class="node-inner">
<div class="breadcrumb">View online at: <a href="http://www.clips.ua.ac.be/pages/stop-words" class="noexternal" target="_blank">http://www.clips.ua.ac.be/pages/stop-words</a></div>
<h1>Stop words</h1>
<!-- Parsed from the online documentation. -->
<div id="node-1378" class="node node-type-page"><div class="node-inner">
<div class="content">
<p>Stop words are words that are so common that they are often filtered out prior to, or after, processing of natural language data (text). For example, the <a href="pattern-vector.html">pattern.vector</a> module (by default) ignores stop words when constructing bag-of-words.</p>
<p>There is no definitive list of stop words. The following set is based on <a href="http://snowball.tartarus.org/algorithms/english/stop.txt" target="_blank">Martin Porter's list</a> and expanded with words that occur frequently in other lists:</p>
<p>&nbsp;</p>
<hr />
<p>&nbsp;</p>
<p><em>a, aboard, about, above, across, after, again, against, all, almost, alone, along, alongside, already, also, although, always, am, amid, amidst, among, amongst, an, and, another, anti, any, anybody, anyone, anything, anywhere, are, area, areas, aren't, around, as, ask, asked, asking, asks, astride, at, aught, away, back, backed, backing, backs, bar, barring, be, became, because, become, becomes, been, before, began, behind, being, beings, below, beneath, beside, besides, best, better, between, beyond, big, both, but, by, came, can, can't, cannot, case, cases, certain, certainly, circa, clear, clearly, come, concerning, considering, could, couldn't, daren't, despite, did, didn't, differ, different, differently, do, does, doesn't, doing, don't, done, down, down, downed, downing, downs, during, each, early, either, end, ended, ending, ends, enough, even, evenly, ever, every, everybody, everyone, everything, everywhere, except, excepting, excluding, face, faces, fact, facts, far, felt, few, fewer, find, finds, first, five, following, for, four, from, full, fully, further, furthered, furthering, furthers, gave, general, generally, get, gets, give, given, gives, go, goes, going, good, goods, got, great, greater, greatest, group, grouped, grouping, groups, had, hadn't, has, hasn't, have, haven't, having, he, he'd, he'll, he's, her, here, here's, hers, herself, high, high, high, higher, highest, him, himself, his, hisself, how, how's, however, i, i'd, i'll, i'm, i've, idem, if, ilk, important, in, including, inside, interest, interested, interesting, interests, into, is, isn't, it, it's, its, itself, just, keep, keeps, kind, knew, know, known, knows, large, largely, last, later, latest, least, less, let, let's, lets, like, likely, long, longer, longest, made, make, making, man, many, may, me, member, members, men, might, mightn't, mine, minus, more, most, mostly, mr, mrs, much, must, mustn't, my, myself, naught, near, necessary, need, needed, needing, needn't, 
needs, neither, never, new, new, newer, newest, next, no, nobody, non, none, noone, nor, not, nothing, notwithstanding, now, nowhere, number, numbers, of, off, often, old, older, oldest, on, once, one, oneself, only, onto, open, opened, opening, opens, opposite, or, order, ordered, ordering, orders, other, others, otherwise, ought, oughtn't, our, ours, ourself, ourselves, out, outside, over, own, part, parted, parting, parts, past, pending, per, perhaps, place, places, plus, point, pointed, pointing, points, possible, present, presented, presenting, presents, problem, problems, put, puts, quite, rather, really, regarding, right, right, room, rooms, round, said, same, save, saw, say, says, second, seconds, see, seem, seemed, seeming, seems, seen, sees, self, several, shall, shan't, she, she'd, she'll, she's, should, shouldn't, show, showed, showing, shows, side, sides, since, small, smaller, smallest, so, some, somebody, someone, something, somewhat, somewhere, state, states, still, still, such, suchlike, sundry, sure, take, taken, than, that, that's, the, thee, their, theirs, them, themselves, then, there, there's, therefore, these, they, they'd, they'll, they're, they've, thine, thing, things, think, thinks, this, those, thou, though, thought, thoughts, three, through, throughout, thus, thyself, till, to, today, together, too, took, tother, toward, towards, turn, turned, turning, turns, twain, two, under, underneath, unless, unlike, until, up, upon, us, use, used, uses, various, versus, very, via, vis-a-vis, want, wanted, wanting, wants, was, wasn't, way, ways, we, we'd, we'll, we're, we've, well, wells, went, were, weren't, what, what's, whatall, whatever, whatsoever, when, when's, where, where's, whereas, wherewith, wherewithal, whether, which, whichever, whichsoever, while, who, who's, whoever, whole, whom, whomever, whomso, whomsoever, whose, whosoever, why, why's, will, with, within, without, won't, work, worked, working, works, worth, would, wouldn't, ye, 
year, years, yet, yon, yonder, you, you'd, you'll, you're, you've, you-all, young, younger, youngest, your, yours, yourself, yourselves</em></p>
</div>
</div></div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
SyntaxHighlighter.all();
</script>
</body>
</html>

@ -1 +0,0 @@
<meta http-equiv="refresh" content="0; url=html/pattern.html" />

@ -1,65 +0,0 @@
/**
* SyntaxHighlighter
* http://alexgorbatchev.com/SyntaxHighlighter
*
* SyntaxHighlighter is donationware. If you are using it, please donate.
* http://alexgorbatchev.com/SyntaxHighlighter/donate.html
*
* @version
* 3.0.83 (July 02 2010)
*
* @copyright
* Copyright (C) 2004-2010 Alex Gorbatchev.
*
* @license
* Dual licensed under the MIT and GPL licenses.
*/
;(function()
{
	// CommonJS: when a require() function exists, obtain SyntaxHighlighter from the shCore module.
	typeof(require) != 'undefined' ? SyntaxHighlighter = require('shCore').SyntaxHighlighter : null;

	/**
	 * JavaScript brush for SyntaxHighlighter.
	 *
	 * Populates this.regexList with the rules used to style JavaScript source:
	 * strings, C-style comments, '#' lines, function declarations and three
	 * groups of keywords (css classes 'keyword1', 'keyword2', 'keyword3').
	 */
	function Brush()
	{
		// Each list is one whitespace-separated string handed to this.getKeywords().
		// Every fragment that is followed by another fragment must end with a space;
		// otherwise the last word of one fragment fuses with the first word of the next.
		var keywords1 =	'break case catch continue ' +
						'default delete do else ' +
						'for function if in instanceof ' +
						'new return switch ' +
						'throw try typeof var while with'
						;

		var keywords2 =	'false true null super this';

		// Fixed: the first two fragments lacked a trailing space, which produced the
		// bogus words 'homename' and 'onmoveonresize' and left 'home', 'name',
		// 'onmove' and 'onresize' out of the list.
		var keywords3 =	'alert back blur close confirm focus forward home ' +
						'name navigate onblur onerror onfocus onload onmove ' +
						'onresize onunload open print prompt scroll status stop';

		var r = SyntaxHighlighter.regexLib;

		this.regexList = [
			{ regex: r.multiLineDoubleQuotedString,					css: 'string' },		// double quoted strings
			{ regex: r.multiLineSingleQuotedString,					css: 'string' },		// single quoted strings
			{ regex: r.singleLineCComments,							css: 'comments1' },		// one line comments
			{ regex: r.multiLineCComments,							css: 'comments2' },		// multiline comments
			{ regex: /\s*#.*/gm,									css: 'preprocessor' },	// preprocessor tags like #region and #endregion
			// "function name(" is split into two matches: the keyword itself and the
			// declared name, which starts 9 characters in (the length of "function ").
			{ regex: /function ([^\()]+)\(/g,						func: function(match, r) {
				return [
					new SyntaxHighlighter.Match("function ", match.index, "keyword1"),
					new SyntaxHighlighter.Match(match[1], match.index+9, "name")
				]; } },
			{ regex: new RegExp(this.getKeywords(keywords1), 'gm'),	css: 'keyword1' },		// keywords 1
			{ regex: new RegExp(this.getKeywords(keywords2), 'gm'),	css: 'keyword2' },		// keywords 2
			{ regex: new RegExp(this.getKeywords(keywords3), 'gm'),	css: 'keyword3' }		// keywords 3
			];

		this.forHtmlScript(r.scriptScriptTags);
	};

	// Register the brush under its aliases.
	Brush.prototype	= new SyntaxHighlighter.Highlighter();
	Brush.aliases	= ['js', 'jscript', 'javascript'];

	SyntaxHighlighter.brushes.JScript = Brush;

	// CommonJS
	typeof(exports) != 'undefined' ? exports.Brush = Brush : null;
})();

@ -1,73 +0,0 @@
/**
* SyntaxHighlighter
* http://alexgorbatchev.com/SyntaxHighlighter
*
* SyntaxHighlighter is donationware. If you are using it, please donate.
* http://alexgorbatchev.com/SyntaxHighlighter/donate.html
*
* @version
* 3.0.83 (July 02 2010)
*
* @copyright
* Copyright (C) 2004-2010 Alex Gorbatchev.
*
* @license
* Dual licensed under the MIT and GPL licenses.
*/
;(function()
{
	// CommonJS: when a require() function exists, obtain SyntaxHighlighter from the shCore module.
	typeof(require) != 'undefined' ? SyntaxHighlighter = require('shCore').SyntaxHighlighter : null;

	/**
	 * Python brush for SyntaxHighlighter.
	 *
	 * Populates this.regexList with the rules used to style Python source:
	 * '#' comments, decorators, triple-quoted strings, single-line strings,
	 * numbers, def/class declarations, built-in function names, keywords and
	 * a few special names.
	 */
	function Brush()
	{
		// Contributed by Gheorghe Milas and Ahmad Sherif

		// Each list is one whitespace-separated string handed to this.getKeywords();
		// every fragment followed by another fragment ends with a space.
		var keywords =	'and assert break class continue def del elif else ' +
						'except exec finally for from global if import in is ' +
						'lambda not or pass print raise return try yield while';

		// 'type' used to be listed twice; the duplicate has been removed.
		var funcs =	'__import__ abs all any apply basestring bin bool buffer callable ' +
					'chr classmethod cmp coerce compile complex delattr dict dir ' +
					'divmod enumerate eval execfile file filter float format frozenset ' +
					'getattr globals hasattr hash help hex id input int intern ' +
					'isinstance issubclass iter len list locals long map max min next ' +
					'object oct open ord pow property range raw_input reduce ' +
					'reload repr reversed round set setattr slice sorted staticmethod ' +
					'str sum super tuple type unichr unicode vars xrange zip';

		var special =	'None True False self cls class_';

		this.regexList = [
				{ regex: SyntaxHighlighter.regexLib.singleLinePerlComments, css: 'comments1' },
				{ regex: /^\s*@\w+/gm,										css: 'decorator' },
				// Triple-quoted strings, possibly spanning lines. The opener must be
				// ''' or """ and the closer must be the same delimiter. The previous
				// pattern accepted mixed quote characters as an opener and relied on
				// [^\1], which inside a character class is an octal escape, not a
				// backreference.
				{ regex: /('''|""")[\s\S]*?\1/gm,							css: 'comments2' },
				{ regex: /"(?!")(?:\.|\\\"|[^\""\n])*"/gm,					css: 'string' },
				{ regex: /'(?!')(?:\.|(\\\')|[^\''\n])*'/gm,				css: 'string' },
				{ regex: /\b\d+\.?\w*/g,									css: 'value' },
				// "def name(" is split into the keyword and the declared name,
				// which starts 4 characters in (the length of "def ").
				{ regex: /def ([^\()]+)\(/g,								func: function(match, r) {
					return [
						new SyntaxHighlighter.Match("def ", match.index, "keyword"),
						new SyntaxHighlighter.Match(match[1], match.index+4, "name")
					]; } },
				// "class Name(" or "class Name:" is split the same way (offset 6 is the
				// length of "class "). \w also admits underscores, which the previous
				// [0-9a-zA-Z]+ rejected.
				{ regex: /class (\w+)(\(|:)/g,								func: function(match, r) {
					return [
						new SyntaxHighlighter.Match("class ", match.index, "keyword"),
						new SyntaxHighlighter.Match(match[1], match.index+6, "name")
					]; } },
				// Built-in names are matched case-insensitively ('i' flag); keywords and
				// special names are case-sensitive.
				{ regex: new RegExp(this.getKeywords(funcs), 'gmi'),		css: 'functions' },
				{ regex: new RegExp(this.getKeywords(keywords), 'gm'),		css: 'keyword' },
				{ regex: new RegExp(this.getKeywords(special), 'gm'),		css: 'color1' }
				];

		this.forHtmlScript(SyntaxHighlighter.regexLib.aspScriptTags);
	};

	// Register the brush under its aliases.
	Brush.prototype	= new SyntaxHighlighter.Highlighter();
	Brush.aliases	= ['py', 'python'];

	SyntaxHighlighter.brushes.Python = Brush;

	// CommonJS
	typeof(exports) != 'undefined' ? exports.Brush = Brush : null;
})();

@ -1,69 +0,0 @@
/**
* SyntaxHighlighter
* http://alexgorbatchev.com/SyntaxHighlighter
*
* SyntaxHighlighter is donationware. If you are using it, please donate.
* http://alexgorbatchev.com/SyntaxHighlighter/donate.html
*
* @version
* 3.0.83 (July 02 2010)
*
* @copyright
* Copyright (C) 2004-2010 Alex Gorbatchev.
*
* @license
* Dual licensed under the MIT and GPL licenses.
*/
;(function()
{
// CommonJS
typeof(require) != 'undefined' ? SyntaxHighlighter = require('shCore').SyntaxHighlighter : null;
function Brush()
{
function process(match, regexInfo)
{
var constructor = SyntaxHighlighter.Match,
code = match[0],
tag = new XRegExp('(&lt;|<)[\\s\\/\\?]*(?<name>[:\\w-\\.]+)', 'xg').exec(code),
result = []
;
if (match.attributes != null)
{
var attributes,
regex = new XRegExp('(?<name> [\\w:\\-\\.]+)' +
'\\s*=\\s*' +
'(?<value> ".*?"|\'.*?\'|\\w+)',
'xg');
while ((attributes = regex.exec(code)) != null)
{
result.push(new constructor(attributes.name, match.index + attributes.index, 'color1'));
result.push(new constructor(attributes.value, match.index + attributes.index + attributes[0].indexOf(attributes.value), 'string'));
}
}
if (tag != null)
result.push(
new constructor(tag.name, match.index + tag[0].indexOf(tag.name), 'keyword')
);
return result;
}
this.regexList = [
{ regex: new XRegExp('(\\&lt;|<)\\!\\[[\\w\\s]*?\\[(.|\\s)*?\\]\\](\\&gt;|>)', 'gm'), css: 'color2' }, // <![ ... [ ... ]]>
{ regex: SyntaxHighlighter.regexLib.xmlComments, css: 'comments' }, // <!-- ... -->
{ regex: new XRegExp('(&lt;|<)[\\s\\/\\?]*(\\w+)(?<attributes>.*?)[\\s\\/\\?]*(&gt;|>)', 'sg'), func: process }
];
};
Brush.prototype = new SyntaxHighlighter.Highlighter();
Brush.aliases = ['xml', 'xhtml', 'xslt', 'html'];
SyntaxHighlighter.brushes.Xml = Brush;
// CommonJS
typeof(exports) != 'undefined' ? exports.Brush = Brush : null;
})();

@ -1,226 +0,0 @@
/**
* SyntaxHighlighter
* http://alexgorbatchev.com/SyntaxHighlighter
*
* SyntaxHighlighter is donationware. If you are using it, please donate.
* http://alexgorbatchev.com/SyntaxHighlighter/donate.html
*
* @version
* 3.0.83 (July 02 2010)
*
* @copyright
* Copyright (C) 2004-2010 Alex Gorbatchev.
*
* @license
* Dual licensed under the MIT and GPL licenses.
*/
.syntaxhighlighter a,
.syntaxhighlighter div,
.syntaxhighlighter code,
.syntaxhighlighter table,
.syntaxhighlighter table td,
.syntaxhighlighter table tr,
.syntaxhighlighter table tbody,
.syntaxhighlighter table thead,
.syntaxhighlighter table caption,
.syntaxhighlighter textarea {
-moz-border-radius: 0 0 0 0 !important;
-webkit-border-radius: 0 0 0 0 !important;
background: none !important;
border: 0 !important;
bottom: auto !important;
float: none !important;
height: auto !important;
left: auto !important;
line-height: 1.1em !important;
margin: 0 !important;
outline: 0 !important;
overflow: visible !important;
padding: 0 !important;
position: static !important;
right: auto !important;
text-align: left !important;
top: auto !important;
vertical-align: baseline !important;
width: auto !important;
box-sizing: content-box !important;
font-family: "Consolas", "Bitstream Vera Sans Mono", "Courier New", Courier, monospace !important;
font-weight: normal !important;
font-style: normal !important;
font-size: 1em !important;
min-height: inherit !important;
min-height: auto !important;
}
.syntaxhighlighter {
width: 100% !important;
margin: 1em 0 1em 0 !important;
position: relative !important;
overflow: auto !important;
font-size: 1em !important;
}
.syntaxhighlighter.source {
overflow: hidden !important;
}
.syntaxhighlighter .bold {
font-weight: bold !important;
}
.syntaxhighlighter .italic {
font-style: italic !important;
}
.syntaxhighlighter .line {
white-space: pre !important;
}
.syntaxhighlighter table {
width: 100% !important;
}
.syntaxhighlighter table caption {
text-align: left !important;
padding: .5em 0 0.5em 1em !important;
}
.syntaxhighlighter table td.code {
width: 100% !important;
}
.syntaxhighlighter table td.code .container {
position: relative !important;
}
.syntaxhighlighter table td.code .container textarea {
box-sizing: border-box !important;
position: absolute !important;
left: 0 !important;
top: 0 !important;
width: 100% !important;
height: 100% !important;
border: none !important;
background: white !important;
padding-left: 1em !important;
overflow: hidden !important;
white-space: pre !important;
}
.syntaxhighlighter table td.gutter .line {
text-align: right !important;
padding: 0 0.5em 0 1em !important;
}
.syntaxhighlighter table td.code .line {
padding: 0 1em !important;
}
.syntaxhighlighter.nogutter td.code .container textarea, .syntaxhighlighter.nogutter td.code .line {
padding-left: 0em !important;
}
.syntaxhighlighter.show {
display: block !important;
}
.syntaxhighlighter.collapsed table {
display: none !important;
}
.syntaxhighlighter.collapsed .toolbar {
padding: 0.1em 0.8em 0em 0.8em !important;
font-size: 1em !important;
position: static !important;
width: auto !important;
height: auto !important;
}
.syntaxhighlighter.collapsed .toolbar span {
display: inline !important;
margin-right: 1em !important;
}
.syntaxhighlighter.collapsed .toolbar span a {
padding: 0 !important;
display: none !important;
}
.syntaxhighlighter.collapsed .toolbar span a.expandSource {
display: inline !important;
}
.syntaxhighlighter .toolbar {
position: absolute !important;
right: 1px !important;
top: 1px !important;
width: 11px !important;
height: 11px !important;
font-size: 10px !important;
z-index: 10 !important;
}
.syntaxhighlighter .toolbar span.title {
display: inline !important;
}
.syntaxhighlighter .toolbar a {
display: block !important;
text-align: center !important;
text-decoration: none !important;
padding-top: 1px !important;
}
.syntaxhighlighter .toolbar a.expandSource {
display: none !important;
}
.syntaxhighlighter.ie {
font-size: .9em !important;
padding: 1px 0 1px 0 !important;
}
.syntaxhighlighter.ie .toolbar {
line-height: 8px !important;
}
.syntaxhighlighter.ie .toolbar a {
padding-top: 0px !important;
}
.syntaxhighlighter.printing .line.alt1 .content,
.syntaxhighlighter.printing .line.alt2 .content,
.syntaxhighlighter.printing .line.highlighted .number,
.syntaxhighlighter.printing .line.highlighted.alt1 .content,
.syntaxhighlighter.printing .line.highlighted.alt2 .content {
background: none !important;
}
.syntaxhighlighter.printing .line .number {
color: #bbbbbb !important;
}
.syntaxhighlighter.printing .line .content {
color: black !important;
}
.syntaxhighlighter.printing .toolbar {
display: none !important;
}
.syntaxhighlighter.printing a {
text-decoration: none !important;
}
.syntaxhighlighter.printing .plain, .syntaxhighlighter.printing .plain a {
color: black !important;
}
.syntaxhighlighter.printing .comments, .syntaxhighlighter.printing .comments a {
color: #008200 !important;
}
.syntaxhighlighter.printing .string, .syntaxhighlighter.printing .string a {
color: blue !important;
}
.syntaxhighlighter.printing .keyword {
color: #006699 !important;
font-weight: bold !important;
}
.syntaxhighlighter.printing .preprocessor {
color: gray !important;
}
.syntaxhighlighter.printing .variable {
color: #aa7700 !important;
}
.syntaxhighlighter.printing .value {
color: #009900 !important;
}
.syntaxhighlighter.printing .functions {
color: #ff1493 !important;
}
.syntaxhighlighter.printing .constants {
color: #0066cc !important;
}
.syntaxhighlighter.printing .script {
font-weight: bold !important;
}
.syntaxhighlighter.printing .color1, .syntaxhighlighter.printing .color1 a {
color: gray !important;
}
.syntaxhighlighter.printing .color2, .syntaxhighlighter.printing .color2 a {
color: #ff1493 !important;
}
.syntaxhighlighter.printing .color3, .syntaxhighlighter.printing .color3 a {
color: red !important;
}
.syntaxhighlighter.printing .break, .syntaxhighlighter.printing .break a {
color: black !important;
}

File diff suppressed because one or more lines are too long

@ -1,117 +0,0 @@
/**
* SyntaxHighlighter
* http://alexgorbatchev.com/SyntaxHighlighter
*
* SyntaxHighlighter is donationware. If you are using it, please donate.
* http://alexgorbatchev.com/SyntaxHighlighter/donate.html
*
* @version
* 3.0.83 (July 02 2010)
*
* @copyright
* Copyright (C) 2004-2010 Alex Gorbatchev.
*
* @license
* Dual licensed under the MIT and GPL licenses.
*/
.syntaxhighlighter {
background-color: white !important;
}
.syntaxhighlighter .line.alt1 {
background-color: white !important;
}
.syntaxhighlighter .line.alt2 {
background-color: white !important;
}
.syntaxhighlighter .line.highlighted.alt1, .syntaxhighlighter .line.highlighted.alt2 {
background-color: #e0e0e0 !important;
}
.syntaxhighlighter .line.highlighted.number {
color: black !important;
}
.syntaxhighlighter table caption {
color: black !important;
}
.syntaxhighlighter .gutter {
color: #afafaf !important;
}
.syntaxhighlighter .gutter .line {
border-right: 3px solid #6ce26c !important;
}
.syntaxhighlighter .gutter .line.highlighted {
background-color: #6ce26c !important;
color: white !important;
}
.syntaxhighlighter.printing .line .content {
border: none !important;
}
.syntaxhighlighter.collapsed {
overflow: visible !important;
}
.syntaxhighlighter.collapsed .toolbar {
color: blue !important;
background: white !important;
border: 1px solid #6ce26c !important;
}
.syntaxhighlighter.collapsed .toolbar a {
color: blue !important;
}
.syntaxhighlighter.collapsed .toolbar a:hover {
color: red !important;
}
.syntaxhighlighter .toolbar {
color: white !important;
background: #6ce26c !important;
border: none !important;
}
.syntaxhighlighter .toolbar a {
color: white !important;
}
.syntaxhighlighter .toolbar a:hover {
color: black !important;
}
.syntaxhighlighter .plain, .syntaxhighlighter .plain a {
color: black !important;
}
.syntaxhighlighter .comments, .syntaxhighlighter .comments a {
color: #008200 !important;
}
.syntaxhighlighter .string, .syntaxhighlighter .string a {
color: blue !important;
}
.syntaxhighlighter .keyword {
color: #006699 !important;
}
.syntaxhighlighter .preprocessor {
color: gray !important;
}
.syntaxhighlighter .variable {
color: #aa7700 !important;
}
.syntaxhighlighter .value {
color: #009900 !important;
}
.syntaxhighlighter .functions {
color: #ff1493 !important;
}
.syntaxhighlighter .constants {
color: #0066cc !important;
}
/* .script elements: bold text in the same colour as .keyword, with no background fill. */
.syntaxhighlighter .script {
font-weight: bold !important;
color: #006699 !important;
/* "none" is not a valid background-color value and the declaration was dropped by
   the parser; "transparent" is the valid way to ask for no background colour. */
background-color: transparent !important;
}
.syntaxhighlighter .color1, .syntaxhighlighter .color1 a {
color: gray !important;
}
.syntaxhighlighter .color2, .syntaxhighlighter .color2 a {
color: #ff1493 !important;
}
.syntaxhighlighter .color3, .syntaxhighlighter .color3 a {
color: red !important;
}
.syntaxhighlighter .keyword {
font-weight: bold !important;
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save